Lianjia scraper code with asyncio

#!/usr/bin/env python
# encoding: utf-8
"""
Author: ISeeMoon
Python: 3.6
Software: PyCharm
File: Lj_async.py
Time: 2018/5/6 15:26
"""

import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time


location_info = '''    1→Hangzhou
    2→Wuhan
    3→Beijing
    Press ENTER to confirm: '''
location_select = input(location_info)
location_dic = {'1': 'hz',
                '2': 'wh',
                '3': 'bj'}
city_url = 'https://{}.lianjia.com/ershoufang/'.format(location_dic[location_select])
down = input('Lower price bound (10k CNY): ')
up = input('Upper price bound (10k CNY): ')

# Price intervals still to be scraped, seeded with the full user-supplied range.
inter_list = [(int(down), int(up))]

def half_inter(inter):
    """Split a price interval in half and replace it with the two halves."""
    lower, upper = inter
    delta = int((upper - lower) / 2)
    inter_list.remove(inter)
    print('Bisected price interval', inter)
    inter_list.append((lower, lower + delta))
    inter_list.append((lower + delta, upper))
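
# For example, half_inter((300, 500)) removes (300, 500) from inter_list and
# appends (300, 400) and (400, 500). The bisection exists because, evidently,
# Lianjia serves at most 100 result pages of 30 listings each, so any interval
# with more than 3000 hits cannot be paged through in full.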

pagenum = {}
def get_num(inter):
    """Return the listing count for a price interval via the bp{lower}ep{upper} filter."""
    url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
    r = requests.get(url).text
    num = int(etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")[0].strip())
    pagenum[(inter[0], inter[1])] = num
    return num
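
# Note: these requests carry no User-Agent. Should Lianjia start rejecting
# bare requests, a browser-like header is a minimal fix (the value below is an
# illustrative assumption, not something the original needed):
#   headers = {'User-Agent': 'Mozilla/5.0'}
#   r = requests.get(url, headers=headers).text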

totalnum = get_num(inter_list[0])

# Keep bisecting until every interval holds at most 3000 listings.
while True:
    # Refresh the listing count for every current interval.
    for i in list(inter_list):
        get_num(i)
    oversized = [i for i in inter_list if pagenum[i] > 3000]
    if not oversized:
        break
    for i in oversized:
        half_inter(i)
print('Finished narrowing price intervals!')

url_lst = []
url_lst_failed = []
url_lst_succeeded = []
url_lst_duplicated = []

# Build every page URL: each interval spans ceil(count / 30) pages,
# addressed as pg{page}bp{lower}ep{upper}.
for i in inter_list:
    totalpage = math.ceil(pagenum[i] / 30)
    for j in range(1, totalpage + 1):
        url = city_url + 'pg{}bp{}ep{}/'.format(j, i[0], i[1])
        url_lst.append(url)
print('URL list built!')
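
# For instance, an interval (300, 400) holding, say, 2400 hits yields 80 URLs:
#   https://hz.lianjia.com/ershoufang/pg1bp300ep400/ ... pg80bp300ep400/
# (the counts are illustrative; the URL pattern comes from the code above)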

info_lst = []

def parse_node(node):
    """Extract one listing's fields from a result <li> node; '/' marks missing data."""
    info_dic = {}
    try:
        info_dic['title'] = node.xpath(".//div[@class='title']/a/text()")[0]
    except IndexError:
        info_dic['title'] = '/'
    try:
        info_dic['href'] = node.xpath(".//div[@class='title']/a/@href")[0]
    except IndexError:
        info_dic['href'] = '/'
    # houseInfo is a single 'xiaoqu|huxing|area|chaoxiang|zhuangxiu|dianti' string.
    try:
        house_info = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')
    except IndexError:
        house_info = []
    for idx, key in enumerate(('xiaoqu', 'huxing', 'area', 'chaoxiang', 'zhuangxiu', 'dianti')):
        info_dic[key] = house_info[idx] if idx < len(house_info) else '/'
    # positionInfo holds the floor (in parentheses) and the build year.
    try:
        position = node.xpath(".//div[@class='positionInfo']/text()")[0]
        info_dic['louceng'] = re.findall(r'\((.*)\)', position)[0]
        info_dic['nianxian'] = re.findall(r'\)(.*?)年', position)[0]
    except IndexError:
        info_dic.setdefault('louceng', '/')
        info_dic.setdefault('nianxian', '/')
    # followInfo is 'guanzhu/daikan/fabu'; keep only the digits of the first two.
    try:
        follow = node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')
    except IndexError:
        follow = []
    info_dic['guanzhu'] = ''.join(re.findall(r'[0-9]', follow[0])) if len(follow) > 0 else '/'
    info_dic['daikan'] = ''.join(re.findall(r'[0-9]', follow[1])) if len(follow) > 1 else '/'
    info_dic['fabu'] = follow[2] if len(follow) > 2 else '/'
    try:
        info_dic['totalprice'] = node.xpath(".//div[@class='totalPrice']/span/text()")[0]
    except IndexError:
        info_dic['totalprice'] = '/'
    try:
        info_dic['unitprice'] = node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('单价', '')
    except IndexError:
        info_dic['unitprice'] = '/'
    return info_dic

async def get_info(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=5) as resp:
            if resp.status != 200:
                url_lst_failed.append(url)
            else:
                url_lst_succeeded.append(url)
            r = await resp.text()
            nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
            print('Scraping {}'.format(resp.url))
            for index, node in enumerate(nodelist, 1):
                info_dic = parse_node(node)
                # A listing whose href was already collected counts as a duplicate.
                if any(info_dic['href'] in dic.values() for dic in info_lst):
                    url_lst_duplicated.append(info_dic)
                else:
                    info_lst.append(info_dic)
                print('#{}:    {} → listing scraped!'.format(index, info_dic['title']))

start = time.time()

# First pass over url_lst. On this pass some URLs never get a request at all;
# the cause is unclear (see the note below).
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
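
# A plausible explanation (an assumption, not verified here): firing every
# request at once exhausts connections or trips the 5-second timeout. A minimal
# sketch that bounds concurrency with asyncio.Semaphore (the limit of 10 is an
# arbitrary choice):
#
#   sem = asyncio.Semaphore(10)
#   async def get_info_limited(url):
#       async with sem:
#           await get_info(url)
#   tasks = [asyncio.ensure_future(get_info_limited(url)) for url in url_lst]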

# Collect the URLs that were never requested and retry them in rounds until
# every URL has been requested.
url_lst_unrequested = []
for url in url_lst:
    if url not in url_lst_succeeded and url not in url_lst_failed:
        url_lst_unrequested.append(url)
while len(url_lst_unrequested) > 0:
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    url_lst_unrequested = [url for url in url_lst if url not in url_lst_succeeded]
end = time.time()
print('This price range holds {} listings in total ({} of them duplicates); {} records actually collected.'.format(totalnum, len(url_lst_duplicated), len(info_lst)))
print('Total time: {} seconds'.format(end - start))

df = pandas.DataFrame(info_lst)
df.to_csv(r"C:\test\ljwh.csv",encoding='gbk')
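
# Note: encoding='gbk' raises UnicodeEncodeError on characters outside GBK.
# If that bites, 'utf-8-sig' keeps the file Excel-friendly while covering all
# of Unicode (an alternative, not what the original used):
#   df.to_csv(r"C:\test\ljwh.csv", encoding='utf-8-sig')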

################## Synchronous version, for timing comparison ##################
# info_lst = []
#
# start1 = time.time()
# for url in url_lst:
#     resp = requests.get(url)
#     nodelist = etree.HTML(resp.text).xpath("//ul[@class='sellListContent']/li")
#     print('Scraping {}'.format(resp.url))
#     for index, node in enumerate(nodelist, 1):
#         info_dic = parse_node(node)  # identical field extraction to the async version
#         if any(info_dic['href'] in dic.values() for dic in info_lst):
#             url_lst_duplicated.append(info_dic)
#         else:
#             info_lst.append(info_dic)
#         print('#{}:    {} → listing scraped!'.format(index, info_dic['title']))
# end = time.time()
# print('{} records actually collected.'.format(len(info_lst)))
# print('Total time: {} seconds'.format(end - start1))