Reading HDFS Files

Reading an uncompressed file

package com.whh.bigdata.xetl.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.*;

/**
 * Created by whh on 2017/9/29.
 */
public class ReadHDFS {

    private static final String utf8 = "UTF-8";

    /**
     * Reads a text file on HDFS line by line and returns a map of
     * (line hashCode -> line numbers), printing duplicate lines as they
     * are found. Note: distinct lines can share a hashCode, so reported
     * duplicates should be verified against the actual text.
     * @param txtFilePath
     * @param conf
     * @return
     */
    public static Map<Integer, List<Integer>> getStringByTXT(String txtFilePath, Configuration conf) {
        Map<Integer, List<Integer>> map = new HashMap<Integer, List<Integer>>();
        FSDataInputStream fsr = null; // input stream from HDFS
        BufferedReader bufferedReader = null;
        String lineTxt = null;
        try {
            FileSystem fs = FileSystem.get(URI.create(txtFilePath), conf);
            fsr = fs.open(new Path(txtFilePath));
            bufferedReader = new BufferedReader(new InputStreamReader(fsr, utf8));
            int lineCount = 1;
            while ((lineTxt = bufferedReader.readLine()) != null) {
                int hc = lineTxt.hashCode();
                if (map.containsKey(hc)) { // duplicate line: print it
                    System.out.println(lineCount + ":" + lineTxt);
                    map.get(hc).add(lineCount);
                } else {
                    List<Integer> list = new ArrayList<Integer>();
                    list.add(lineCount);
                    map.put(hc, list);
                }
                lineCount++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return map;
    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        String txtFilePath = "hdfs://SANDBOX-HADOOP-01.whh.net:8022/log_data/stg_log_1600005/day=2017-11-19/-r-00001";
        Map<Integer, List<Integer>> map = getStringByTXT(txtFilePath, conf);
        // print every hash that occurred on more than one line
        for (Map.Entry<Integer, List<Integer>> entry : map.entrySet()) {
            if (entry.getValue().size() > 1) {
                System.out.println(entry.getKey() + ":" + entry.getValue());
            }
        }
    }
}
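The original listing also carried a stray fragment, hdfs = FileSystem.get(new URI(...), conf, "bigdata"), showing how to open the FileSystem as an explicit user rather than the current OS user. A minimal sketch of that call, with the user name "bigdata" and the NameNode address taken from the post (note this overload also declares InterruptedException):

Configuration conf = new Configuration();
// The three-argument overload performs the connection as the named
// user instead of the process owner.
FileSystem hdfs = FileSystem.get(
        new URI("hdfs://SANDBOX-HADOOP-01.whh.net:8022"), conf, "bigdata");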

Reading a compressed file

package com.whh.bigdata.xetl.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;

/**
 * Created by whh on 2017/9/29.
 */
public class ReadHDFS {

    private static final String utf8 = "UTF-8";

    /**
     * Returns the file's compression codec name, or null when the
     * file-name suffix matches no registered codec.
     * @param file
     * @param conf
     * @return
     */
    public static String getCodec(Path file, Configuration conf) {
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(file);
        return codec == null ? null : codec.getClass().getSimpleName();
    }

    /**
     * Reads the first line of a compressed file and returns it.
     * @param txtFilePath
     * @param conf
     * @return
     */
    public static String getStringByTXT1(String txtFilePath, Configuration conf) {
        FSDataInputStream fsr = null;
        BufferedReader bufferedReader = null;
        String lineTxt = null;
        try {
            FileSystem fs = FileSystem.get(URI.create(txtFilePath), conf);
            fsr = fs.open(new Path(txtFilePath));
            // pick the codec from the file-name suffix (e.g. ".gz" -> GzipCodec)
            CompressionCodecFactory factory = new CompressionCodecFactory(conf);
            CompressionCodec codec = factory.getCodec(new Path(txtFilePath));
            System.out.println("codec=" + codec);
            CompressionInputStream compin = codec.createInputStream(fsr);
            bufferedReader = new BufferedReader(new InputStreamReader(compin, utf8));
            while ((lineTxt = bufferedReader.readLine()) != null) {
                // if (lineTxt.split("\t")[0].trim().equals("00067")) {
                return lineTxt; // only the first line is needed
                // }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return lineTxt;
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        String txtFilePath = "hdfs://SANDBOX-HADOOP-01.whh.net:8022/collect_data/userlog/20170925/kp_diag_2017092523_10.1.11.171.1506354616660.1549.log.gz";
        String mbline = getStringByTXT1(txtFilePath, conf);
        System.out.println(mbline);
    }
}
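One pitfall in the listing above: CompressionCodecFactory.getCodec returns null when the file-name suffix matches no registered codec, so getStringByTXT1 would throw a NullPointerException on a plain, uncompressed file. A minimal defensive sketch (a hypothetical helper, not part of the original post; it additionally needs java.io.InputStream):

// Hypothetical helper: open an HDFS file, decompressing only when a codec
// matches the file-name suffix, otherwise reading the raw stream.
public static BufferedReader openPossiblyCompressed(String path, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(URI.create(path), conf);
    FSDataInputStream raw = fs.open(new Path(path));
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(new Path(path));
    // getCodec returns null for unrecognized suffixes; fall back to the raw stream
    InputStream in = (codec == null) ? raw : codec.createInputStream(raw);
    return new BufferedReader(new InputStreamReader(in, "UTF-8"));
}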

Reposted from: https://my.oschina.net/u/3267050/blog/1619492
