Python爬虫:爬取JS加载数据的网页

比如简书:

Paste_Image.png

我们来写个程序，爬取简书网站随便一个作者的所有文章，再对其所有文章进行分词统计
程序运行统计的结果见文章:
我统计了彭小六简书360篇文章中使用的词语

需要的Python包

包名	作用
selenium	用于和phantomjs合作模拟浏览器访问网页
lxml	用于对html页面的解析，提取数据
jieba	用于对文章正文分词
tld	解析url，比如提取domain

还需要下载 phantomjs，selenium配合phantomjs的使用代码中有体现
下载地址: http://phantomjs.org/

下面的代码使用文件而不是数据库来保存数据，所以代码量看起来比较多，但真正核心的爬取和解析代码其实并不多

#### 直接上代码

# -*-coding:utf-8-*-
import json
import os
import re
import sys
import time
from collections import Counter
from random import randint
from urllib.parse import urlparse

import jieba
from lxml import etree
from selenium import webdriver
from tld import get_tld
path = os.path.abspath(os.path.dirname(__file__))

class Spider():
    """Crawl all articles of one Jianshu author and run word statistics on them.

    The work is split into four stages that hand data to each other through
    files below the base directory (swap the file I/O for a database if you
    prefer; a NoSQL store fits the JSON documents well):

    1. ``post_list_page`` -- render the author's JS-driven article list with
       PhantomJS, save the list HTML and every article URL.
    2. ``posts_html`` -- download the rendered HTML of each article.
    3. ``page_parsing`` -- extract fields from each saved page into JSON.
    4. ``statistics`` -- segment titles and bodies with jieba and write
       frequency tables.

    Written for Python 3: all text files are read and written as UTF-8.
    """

    # Number of articles assumed to be appended per lazy-load round of the
    # list page (value taken from the original script -- TODO confirm
    # against the live site).
    POSTS_PER_SCROLL = 9

    def __init__(self, start_url, base_dir=None, executable_path=None):
        """Compute and create the output locations for one author.

        :param start_url: the author's article-list page, for example
            http://www.jianshu.com/u/65fd4e5d930d
        :param base_dir: directory that receives all output folders; when
            omitted, the module-level ``path`` (this script's directory)
            is used.
        :param executable_path: PhantomJS binary; when omitted,
            ``<base_dir>/phantomjs-2.1.1-linux-x86_64/bin/phantomjs``.
        :return:
        """
        self.start_url = start_url
        base = path if base_dir is None else base_dir

        # Take the host straight from the URL with the standard library.
        # Combining ``subdomain`` and ``tld`` from the ``tld`` package
        # depends on the installed version (newer releases appear to return
        # only the public suffix in ``tld``) and yields a leading dot when
        # the URL has no subdomain.
        url = start_url if "://" in start_url else "http://{}".format(start_url)
        parsed = urlparse(url)
        self.domain = parsed.netloc

        # Last path segment; a trailing slash must not produce an empty id.
        self.user_id = parsed.path.rstrip("/").split("/")[-1]

        # Rendered HTML of the author's article list.
        post_list_dir = '{}/post-list'.format(base)
        self.post_lists_html = '{}/post_list_{}.html'.format(post_list_dir, self.user_id)
        # One article URL per line.
        self.post_lists_urls = '{}/urls_{}.dat'.format(post_list_dir, self.user_id)
        # Raw HTML of every article.
        self.posts_html_dir = '{}/post-html/{}'.format(base, self.user_id)
        # Parsed article documents (JSON).
        self.posts_data_dir = '{}/post-data/{}'.format(base, self.user_id)
        # Statistics output.
        self.result_dir = '{}/result'.format(base)

        if executable_path is None:
            executable_path = '{}/phantomjs-2.1.1-linux-x86_64/bin/phantomjs'.format(base)
        self.executable_path = executable_path

        for directory in (self.posts_html_dir, self.posts_data_dir,
                          post_list_dir, self.result_dir):
            os.makedirs(directory, exist_ok=True)

        # Free proxies picked up online; they are very likely stale.
        # Replace them, or set ``ips = []`` to connect without a proxy.
        self.ips = ['61.167.222.17:808','58.212.121.72:8998', '111.1.3.36:8000', '125.117.133.74:9000']

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _scroll_rounds(post_count, per_round=POSTS_PER_SCROLL):
        """Return how many scroll rounds cover ``post_count`` articles (ceiling)."""
        if post_count <= 0:
            return 0
        return (post_count + per_round - 1) // per_round

    @staticmethod
    def _safe_filename(title):
        """Return ``title`` with characters that are unsafe in file names replaced."""
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title or "").strip()
        # Keep the name short enough for common file-name length limits.
        return cleaned[:60] or "untitled"

    @staticmethod
    def _post_id_from_filename(file_name):
        """Return the post id from a ``<title>_<post_id>.html`` file name."""
        stem = os.path.splitext(os.path.basename(file_name))[0]
        return stem.rsplit("_", 1)[-1]

    @staticmethod
    def _parse_word_count(text):
        """Return the integer contained in a word-count label, or 0 if none."""
        digits = re.sub(r"\D", "", text or "")
        return int(digits) if digits else 0

    @staticmethod
    def _accumulate_words(words, totals):
        """Fold the words of one text into ``totals``.

        Only words of two or more characters are counted.  ``totals`` maps
        word -> ``{"cnt": total occurrences, "post_cnt": number of texts}``
        and is updated in place.

        :return: number of distinct counted words in this text.
        """
        distinct = 0
        for word, cnt in Counter(words).most_common():
            if len(word) < 2:
                continue
            entry = totals.setdefault(word, {"cnt": 0, "post_cnt": 0})
            entry["cnt"] += cnt
            entry["post_cnt"] += 1
            distinct += 1
        return distinct

    def _new_driver(self, page_load_timeout):
        """Start PhantomJS behind a randomly chosen proxy.

        The proxy has to be given on the PhantomJS command line; assigning
        an attribute on the driver object afterwards has no effect.

        :return: ``(driver, proxy)``; ``proxy`` is None when ``self.ips``
            is empty.
        """
        proxy = self.ips[randint(0, len(self.ips) - 1)] if self.ips else None
        service_args = []
        if proxy:
            service_args = ["--proxy={}".format(proxy), "--proxy-type=http"]
        driver = webdriver.PhantomJS(executable_path=self.executable_path,
                                     service_args=service_args)
        try:
            driver.set_page_load_timeout(page_load_timeout)
            driver.maximize_window()
        except Exception:
            # Do not leave a PhantomJS process behind if setup fails.
            driver.quit()
            raise
        return driver, proxy

    def _write_word_table(self, file_name, header, totals, first, second):
        """Write ``totals`` sorted by ``first`` (descending) as a table.

        :return: the sorted ``(word, stats)`` pairs.
        """
        ranked = sorted(totals.items(), key=lambda kv: kv[1][first], reverse=True)
        target = "{}/{}".format(self.result_dir, file_name)
        with open(target, "w", encoding="utf-8") as wf:
            wf.write(header)
            for word, stat in ranked:
                wf.write("| {} | {} | {}|\n".format(word, stat[first], stat[second]))
        return ranked

    # ------------------------------------------------------------------
    # stages
    # ------------------------------------------------------------------
    def post_list_page(self):
        '''
        Fetch the author's article list page and collect the article links.

        Writes the fully scrolled list HTML to ``self.post_lists_html`` and
        one article URL per line to ``self.post_lists_urls``.  Exits the
        process with status 1 when the article count cannot be read.
        :return:
        '''
        obj, _proxy = self._new_driver(30)
        try:
            obj.get(self.start_url)

            # Total number of articles, read from the profile header.
            sel = etree.HTML(obj.page_source)
            r = sel.xpath("//div[@class='main-top']//div[@class='info']//li[3]//p//text()")
            try:
                crawl_post_n = int(r[0])
            except (IndexError, ValueError):
                print("[Error] 提取文章总书的xpath不正确")
                sys.exit(1)

            # Keep scrolling to the bottom so the page lazy-loads every
            # article.  The count is an integer ceiling: a float here would
            # never reach zero and the loop would not terminate.  Sleeping
            # before each scroll gives the previous batch time to load.
            js = "var q=document.body.scrollTop=100000"
            for _ in range(self._scroll_rounds(crawl_post_n)):
                time.sleep(randint(2, 5))
                obj.execute_script(js)
            page_source = obj.page_source
        finally:
            obj.quit()

        # Save the list page (file here; could be a database).
        with open(self.post_lists_html, "w", encoding="utf-8") as of:
            of.write(page_source)

        # Save every article link as an absolute URL.
        sel = etree.HTML(page_source)
        results = sel.xpath("//div[@id='list-container']//li//a[@class='title']/@href")
        with open(self.post_lists_urls, "w", encoding="utf-8") as of:
            for result in results:
                of.write("http://{}{}".format(self.domain, result.strip()))
                of.write("\n")

    def posts_html(self):
        '''
        Download the rendered HTML of every article listed in
        ``self.post_lists_urls`` into ``self.posts_html_dir`` as
        ``<title>_<post_id>.html``.

        A URL that fails to load is reported and skipped, so a stale page
        is never stored under the wrong post id.
        :return:
        '''
        with open(self.post_lists_urls, encoding="utf-8") as of:
            urls = [line.strip() for line in of if line.strip()]

        # One proxy per browser session (PhantomJS takes it at start-up).
        obj, ip = self._new_driver(10)
        try:
            for url in urls:
                print("代理ip:{}".format(ip))
                print("网页:{}".format(url))

                try:
                    obj.get(url)
                except Exception:
                    print("Error:{}".format(url))
                else:
                    post_id = url.rstrip("/").split("/")[-1]
                    target = "{}/{}_{}.html".format(self.posts_html_dir,
                                                    self._safe_filename(obj.title),
                                                    post_id)
                    with open(target, "w", encoding="utf-8") as of:
                        of.write(obj.page_source)
                # Be polite to the server between requests.
                time.sleep(randint(1, 5))
        finally:
            obj.quit()

    def page_parsing(self):
        '''
        Parse every saved article page and write one JSON document per
        article to ``self.posts_data_dir`` as ``<post_id>.json``.
        :return:
        '''
        # Rules where only the first match is kept.
        xpath_rule_0 ={
            "author":"//div[@class='author']//span[@class='name']//text()", # author name
            "author_tag":"//div[@class='author']//span[@class='tag']//text()",# author tag
            "postdate":"//div[@class='author']//span[@class='publish-time']//text()", # publish time
            "word_num":"//div[@class='author']//span[@class='wordage']//text()",# word count
            "notebook":"//div[@class='show-foot']//a[@class='notebook']/span/text()",# notebook the article belongs to
            "title":"//div[@class='article']/h1[@class='title']//text()",# article title
        }
        # Rules where all matches are concatenated into one string.
        xpath_rule_all_tostr ={
            "content":"//div[@class='show-content']//text()",# article body
        }
        # Rules where all matches are kept as a list.
        xpath_rule_all ={
            "collection":"//div[@class='include-collection']//a[@class='item']//text()",# collections that include the article
        }
        # Walk the saved article pages (query them instead if stored in a database).
        for name in os.listdir(self.posts_html_dir):
            file_path = "{}/{}".format(self.posts_html_dir, name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, encoding="utf-8") as of:
                sel = etree.HTML(of.read())
            if sel is None:
                # No tree could be built from this file; nothing to extract.
                continue

            # Proper suffix removal: str.strip(".html") strips a character
            # set and can eat leading/trailing characters of the id.
            post_id = self._post_id_from_filename(name)
            doc = {'url':'http://{}/p/{}'.format(self.domain,post_id)}

            for k, rule in xpath_rule_0.items():
                results = sel.xpath(rule)
                doc[k] = results[0] if results else None

            for k, rule in xpath_rule_all_tostr.items():
                results = sel.xpath(rule)
                if results:
                    # Whitespace-only text nodes are dropped.
                    doc[k] = "".join(result for result in results if result.strip())
                else:
                    doc[k] = None

            for k, rule in xpath_rule_all.items():
                results = sel.xpath(rule)
                doc[k] = results if results else None

            doc["word_num"] = self._parse_word_count(doc["word_num"])

            # Save to a file (or a database).
            with open("{}/{}.json".format(self.posts_data_dir, post_id), "w", encoding="utf-8") as of:
                of.write(json.dumps(doc))

    def statistics(self):
        '''
        Segment every parsed article with jieba and write the statistics:
        distinct-word count per article, and per-word totals for bodies and
        titles, each sorted by occurrences and by number of articles.
        :return:
        '''
        word_sum = {}  # word totals over all article bodies
        title_word_sum = {}  # word totals over all titles
        post_word_cnt_list = []  # (distinct words, postdate, title, url) per article
        processed = 0

        # Walk the parsed documents (query them instead if stored in a database).
        for name in os.listdir(self.posts_data_dir):
            file_path = "{}/{}".format(self.posts_data_dir, name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, encoding="utf-8") as of:
                doc = json.load(of)
            processed += 1

            # Accurate mode is jieba's default; cut_all=False is explicit.
            # Missing fields are stored as null, so fall back to "".
            words = jieba.cut(doc.get("content") or "", cut_all=False)
            word_cnt = self._accumulate_words(words, word_sum)
            post_word_cnt_list.append((word_cnt,
                                       doc.get("postdate"),
                                       doc.get("title"),
                                       doc.get("url")))

            words = jieba.cut(doc.get("title") or "", cut_all=False)
            self._accumulate_words(words, title_word_sum)

        # Sort once after the loop instead of after every article.
        post_word_cnt_list.sort(key=lambda d: d[0], reverse=True)

        with open("{}/content_statis_{}.dat".format(self.result_dir, self.user_id), "w", encoding="utf-8") as wf:
            wf.write("| 词语 | 发布日期 | 标题 | 链接 |\n")
            for pw in post_word_cnt_list:
                wf.write("|　{} | {} | {}| {}|\n".format(pw[0],pw[1],pw[2],pw[3]))

        # Body words, sorted by number of occurrences.
        self._write_word_table(
            "content_statis_sum_use-num_{}.dat".format(self.user_id),
            "| 分词 | 使用次数 | 使用的文章数量|\n",
            word_sum, "cnt", "post_cnt")

        # Body words, sorted by number of articles using them.
        word_sum_t = self._write_word_table(
            "content_statis_sum_post-num_{}.dat".format(self.user_id),
            "| 分词 | 使用的文章数量 | 使用次数 |\n",
            word_sum, "post_cnt", "cnt")

        # Title words, sorted by number of occurrences.
        self._write_word_table(
            "title_statis_sum_use-num_{}.dat".format(self.user_id),
            "| 分词 | 使用次数 | 使用的文章数量|\n",
            title_word_sum, "cnt", "post_cnt")

        # Title words, sorted by number of articles using them.
        title_word_sum_t = self._write_word_table(
            "title_statis_sum_post-num_{}.dat".format(self.user_id),
            "| 分词 | 使用的文章数量 | 使用次数 |\n",
            title_word_sum, "post_cnt", "cnt")

        # Report the documents actually read, not every directory entry.
        print("一共统计文章：{}　篇".format(processed))
        print("所有正文－使用了２字及以上词语：{}　个".format(len(word_sum_t)))
        print("所有标题－使用了２字及以上词语：{}　个".format(len(title_word_sum_t)))

if __name__ == '__main__':
    # Run the whole pipeline.  Each stage reads the files written by the
    # stage before it, so they must execute in this order.  The later
    # stages used to be commented out while their progress messages were
    # still printed, which reported work that never happened.
    sp = Spider(start_url="http://www.jianshu.com/u/65fd4e5d930d")
    print("获取作者文章列表页面...")
    sp.post_list_page()
    print("获取作者所有文章页面...")
    sp.posts_html()
    print("解析作者所有文章页面...")
    sp.page_parsing()
    print("简单统计分析文章词汇...")
    sp.statistics()

程序运行统计的结果见文章: 我统计了彭小六简书360篇文章中使用的词语

最后编辑于：2017.12.07 03:38:37

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 204,189评论 6赞 478
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 85,577评论 2赞 381
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 150,857评论 0赞 337
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 54,703评论 1赞 276
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 63,705评论 5赞 366
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 48,620评论 1赞 281
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 37,995评论 3赞 396
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,656评论 0赞 258
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 40,898评论 1赞 298
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,639评论 2赞 321
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,720评论 1赞 330
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,395评论 4赞 319
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 38,982评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 29,953评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,195评论 1赞 260
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 44,907评论 2赞 349
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,472评论 2赞 342

Python爬虫:爬取JS加载数据的网页

直接上代码####

推荐阅读更多精彩内容