1. A file stores 1 billion words (each made up of digits and letters, each under 16 bytes), one per line. Find the 10 words that occur most frequently.
Algorithm 1: divide and conquer + hashing
- 1 billion words cannot all be loaded into memory and processed at once.
- Apply "divide and conquer": hash each word and, according to its hash value, distribute the words across several smaller files.
- For each small file, build a hash map with the word as key and its occurrence count as value, and record the 10 most frequent words in that file.
- Because identical words always hash to the same file, a word's count within its file is already its global count; merging the per-file top-10 lists with a conventional sorting algorithm therefore yields the overall top 10.
//Algorithm 1: implementation of "divide and conquer + hashing"
package com.yunfeng;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;
public class BigDataSort {

    public final static int FILE_NUM = 10;

    // Return the 10 most frequent words in a single partition file.
    public static Set<WordEntity> getTop10Word(File file) {
        Map<String, Integer> map = new HashMap<String, Integer>();
        Set<WordEntity> set = new TreeSet<WordEntity>();
        Set<WordEntity> setTop10 = new TreeSet<WordEntity>();
        // try-with-resources closes the reader even when an exception is thrown
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String s;
            while ((s = br.readLine()) != null) {
                // count each occurrence of the word
                Integer old = map.get(s);
                map.put(s, old == null ? 1 : old + 1);
            }
            // the TreeSet orders entries by descending count (see WordEntity.compareTo)
            for (Map.Entry<String, Integer> entry : map.entrySet()) {
                set.add(new WordEntity(entry.getKey(), entry.getValue()));
            }
            // keep only the first 10 entries, i.e. the 10 highest counts
            int count = 0;
            for (WordEntity w : set) {
                setTop10.add(w);
                if (++count == 10) {
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return setTop10;
    }
    // Split the big input file into FILE_NUM partition files by word hash;
    // identical words always land in the same partition file.
    public static File[] cutFileByHash(String file) {
        File[] files = new File[FILE_NUM];
        BufferedWriter[] brs = new BufferedWriter[FILE_NUM];
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            for (int i = 0; i < FILE_NUM; i++) {
                files[i] = new File("file_" + i);
                brs[i] = new BufferedWriter(new FileWriter(files[i]));
            }
            String s;
            while ((s = br.readLine()) != null) {
                // hashCode() % FILE_NUM may be negative, so take the absolute value
                int j = Math.abs(s.hashCode() % FILE_NUM);
                brs[j].write(s);
                brs[j].newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                for (BufferedWriter b : brs) {
                    if (b != null) { // a writer may never have been opened
                        b.close();
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return files;
    }
    public static void main(String[] args) {
        // split the big file, then merge the per-partition top-10 sets;
        // the TreeSet keeps the merged entries sorted by descending count
        File[] files = cutFileByHash("123.txt"); // name of the big input file
        Set<WordEntity> set = new TreeSet<WordEntity>();
        for (File file : files) {
            set.addAll(getTop10Word(file));
        }
        // print the 10 globally most frequent words
        int count = 0;
        for (WordEntity wordEntity : set) {
            System.out.println("Top word: " + wordEntity.getKey()
                    + "; frequency: " + wordEntity.getCount());
            if (++count == 10) {
                break;
            }
        }
    }
}
// A word together with its occurrence count, ordered by descending count.
class WordEntity implements Comparable<WordEntity> {
    private String key;
    private Integer count;

    public WordEntity(String key, Integer count) {
        this.key = key;
        this.count = count;
    }

    public int compareTo(WordEntity o) {
        // higher counts come first; ties are broken by the word itself so that
        // distinct words with equal counts are not collapsed by the TreeSet
        int cmp = Integer.compare(count, o.count);
        return (cmp == 0 ? key.compareTo(o.key) : -cmp);
    }

    public String getKey() {
        return key;
    }

    public Integer getCount() {
        return count;
    }
}
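The TreeSet in getTop10Word holds every distinct word of a partition before the first 10 are taken. A common refinement is a bounded min-heap that never holds more than 10 candidates. The sketch below is a minimal illustration of that variant, not part of the original program; TopKSketch and topK are hypothetical names, and it reuses the WordEntity class above.

//An alternative sketch: per-partition top-K selection with a bounded min-heap
package com.yunfeng;

import java.util.*;

public class TopKSketch {
    // Return the k most frequent entries of a count map, most frequent first.
    public static List<WordEntity> topK(Map<String, Integer> counts, int k) {
        // min-heap by ascending count: the weakest of the current k candidates
        // sits at the head and can be evicted in O(log k)
        PriorityQueue<WordEntity> heap = new PriorityQueue<WordEntity>(
                k, new Comparator<WordEntity>() {
                    public int compare(WordEntity a, WordEntity b) {
                        return Integer.compare(a.getCount(), b.getCount());
                    }
                });
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            heap.offer(new WordEntity(e.getKey(), e.getValue()));
            if (heap.size() > k) {
                heap.poll(); // drop the current minimum
            }
        }
        // drain the heap and sort; WordEntity's natural order is descending count
        List<WordEntity> result = new ArrayList<WordEntity>(heap);
        Collections.sort(result);
        return result;
    }
}

This keeps the selection structure at k entries no matter how many distinct words a partition contains, turning the per-partition selection from O(n log n) into O(n log k); the count map itself is still needed, so the hash partitioning above remains the step that bounds memory.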