Import the required packages
from pyspark import SparkConf, SparkContext
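The examples below use a SparkContext named sc. In the pyspark shell, sc is created automatically; in a standalone script it has to be built by hand. A minimal sketch, assuming local mode (the master URL and application name here are placeholders):
conf = SparkConf().setMaster('local[*]').setAppName('wordcount')  # local mode with all cores; app name is arbitrary
sc = SparkContext(conf=conf)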
Upload the local file to HDFS
hadoop fs -copyFromLocal /data/data.txt /spark/
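To confirm the upload succeeded, list the target directory in HDFS:
hadoop fs -ls /spark/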
Load the file data
lines = sc.textFile('hdfs://localhost:9000/spark/data.txt')
Count the number of occurrences of each word
wordCount = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word,1)).reduceByKey(lambda a, b : a + b)
wordCount.collect()
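collect() returns the (word, count) pairs in no particular order. A small follow-up sketch using the standard sortBy and take RDD methods, assuming you want the most frequent words first:
wordCount.sortBy(lambda kv: kv[1], ascending=False).take(10)  # top 10 words by count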
Count the total number of characters
lineLengths = lines.map(lambda s:len(s))
lineLengths.reduce(lambda a, b: a + b)
Count the lines that contain a particular character
lines.filter(lambda line: 'e' in line).count()
Find the maximum number of words in a single line
lines.map(lambda line: len(line.split(" "))).reduce(lambda a, b: a if a > b else b)
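Equivalently, Python's built-in max can be passed straight to reduce as the combining function:
lines.map(lambda line: len(line.split(" "))).reduce(max)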
Persist (cache) an RDD
slist = ["Hadoop", "Spark", "Hive"]
rdd = sc.parallelize(slist)
rdd.cache()
rdd.count()
','.join(rdd.collect())
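cache() is shorthand for persist() with the default memory-only storage level. When the cached data is no longer needed it can be released, or re-persisted with an explicit storage level; a sketch using the standard StorageLevel API:
from pyspark import StorageLevel
rdd.unpersist()                             # release the cached partitions
rdd.persist(StorageLevel.MEMORY_AND_DISK)   # re-persist, spilling to disk when memory is short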
The complete code is available at https://github.com/freedommay/notebook/tree/master/spark