Scala版本的单词计数
/**
 * Word count over a text file, written against the Scala Spark API.
 *
 * The original version could only run in local mode because it called
 * `setMaster("local")` unconditionally: a master set directly on the
 * SparkConf takes precedence over the `--master` flag of `spark-submit`.
 * The master is now only defaulted to "local" when nothing else supplied
 * one, so the same jar runs unchanged in the IDE and on a cluster.
 *
 * Usage: Spark [inputPath [outputPath]]
 * Both paths fall back to the original hard-coded HDFS locations.
 */
object Spark {
  private val DefaultInput = "hdfs://192.168.1.200:9000/input01"
  private val DefaultOutput = "hdfs://192.168.1.200:9000/output05"

  /**
   * Splits every input line on single spaces, counts the occurrences of each
   * token and writes the (token, count) pairs as text.
   *
   * @param args optional input path (index 0) and output path (index 1)
   */
  def main(args: Array[String]): Unit = {
    val inputPath = if (args.length > 0) args(0) else DefaultInput
    val outputPath = if (args.length > 1) args(1) else DefaultOutput

    // Only fall back to local mode when no master was provided externally
    // (e.g. via `spark-submit --master ...` or spark-defaults.conf).
    val conf = new SparkConf()
      .setAppName("wordCount")
      .setIfMissing("spark.master", "local")
    val sc = new SparkContext(conf)

    try {
      val input = sc.textFile(inputPath)
      val words = input.flatMap(line => line.split(" "))
      val counts = words.map(word => (word, 1)).reduceByKey(_ + _)
      counts.saveAsTextFile(outputPath)
    } finally {
      // Stop the context even if the job fails.
      sc.stop()
    }
  }
}
传递引用了对象成员(方法或字段)的函数时,Spark 会把整个对象序列化后发送到工作节点,影响效率
/**
 * Helpers that filter or split the lines of an RDD using a fixed query string.
 *
 * Every method copies `query` into a local `val` before building the closure
 * handed to Spark. Referring to `query` or `isMatch` directly inside the
 * closure really means `this.query` / `this.isMatch`, which makes the closure
 * capture the whole `SearchFunctions` instance. Spark then has to serialize
 * that instance for every task, and since this class is not `Serializable`
 * that is expected to fail with a "Task not serializable" error.
 *
 * @param query the substring to look for / the regex to split on
 */
class SearchFunctions(val query: String) {

  /** Returns true when `s` contains `query`. */
  def isMatch(s: String): Boolean = s.contains(query)

  /**
   * Keeps the lines of `rdd` that contain `query`.
   *
   * Previously written as `rdd.filter(isMatch)`, which captured `this`.
   * The predicate is now a closure over a local copy of the query only.
   */
  def getMatchesFunctionReference(rdd: RDD[String]): RDD[String] = {
    val needle = query
    rdd.filter(line => line.contains(needle))
  }

  /**
   * Splits every line of `rdd` on `query` (interpreted as a regex by
   * `String.split`) and flattens the pieces into one RDD.
   *
   * Previously referenced the field `query` inside the closure, which
   * captured `this`; a local copy is used instead.
   */
  def getMatchesFieldReference(rdd: RDD[String]): RDD[String] = {
    val separator = query
    rdd.flatMap(line => line.split(separator))
  }

  /**
   * Splits every line of `rdd` on `query` and flattens the pieces, using a
   * local copy of the field so that only the string is shipped to the tasks.
   */
  def getMatchesNoReference(rdd: RDD[String]): RDD[String] = {
    val separator = query
    rdd.flatMap(line => line.split(separator))
  }
}