Note: this article is intended only for beginners to learn from and discuss; please do not use it for any other purpose.
1. Analysis
- Inspecting the site shows that, apart from Beijing, every city's new-house and second-hand-house URLs follow the same pattern. Taking Shanghai as an example, the new-house list is at https://sh.newhouse.fang.com/house/s/ and the second-hand-house list is at https://sh.esf.fang.com/; only the city abbreviation differs. So once we have the full list of cities, we can crawl the new-house and second-hand-house listings for every city (a small sketch of this URL pattern follows right after this list).
- Open the Fang.com home page and click "more cities". The city-list page that appears is our crawl entry point: https://www.fang.com/SoufunFamily.htm
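For illustration, here is a minimal sketch of that URL pattern; the helper name city_list_urls and the example abbreviation are mine, not part of the project, and Beijing is the exception handled separately in the spider below:

def city_list_urls(abbr):
    # Build the two list URLs from a city abbreviation such as "sh" (Shanghai).
    newhouse_url = "https://{}.newhouse.fang.com/house/s/".format(abbr)
    esf_url = "https://{}.esf.fang.com/".format(abbr)
    return newhouse_url, esf_url

print(city_list_urls("sh"))
# ('https://sh.newhouse.fang.com/house/s/', 'https://sh.esf.fang.com/')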
2. Coding
The rest of this section goes straight to the code. Most of the work is figuring out the XPath expressions for the fields we want to extract; once you get the hang of it, it turns out to be fairly mechanical...
# -*- coding: utf-8 -*-
"""
items.py
"""
import scrapy


class NewHouseItem(scrapy.Item):
    province = scrapy.Field()      # province
    city = scrapy.Field()          # city
    name = scrapy.Field()          # project name
    price = scrapy.Field()         # price
    rooms = scrapy.Field()         # number of rooms
    ares = scrapy.Field()          # floor area
    address = scrapy.Field()       # address
    district = scrapy.Field()      # district
    sale = scrapy.Field()          # sale status
    origin_url = scrapy.Field()    # original detail-page url


class ESFHouseItem(scrapy.Item):
    province = scrapy.Field()      # province
    city = scrapy.Field()          # city
    name = scrapy.Field()          # listing name
    price = scrapy.Field()         # total price
    rooms = scrapy.Field()         # number of rooms
    floor = scrapy.Field()         # floor
    toward = scrapy.Field()        # orientation
    year = scrapy.Field()          # year built
    ares = scrapy.Field()          # floor area
    address = scrapy.Field()       # address
    unit = scrapy.Field()          # unit price
    origin_url = scrapy.Field()    # original detail-page url
The spider code follows:
# -*- coding: utf-8 -*-
"""
soufang.py
"""
import re

import scrapy
from scrapy_redis.spiders import RedisSpider

from fang.items import NewHouseItem, ESFHouseItem


class SoufangSpider(RedisSpider):
    name = 'soufang'
    allowed_domains = ['fang.com']
    # start_urls = ['https://www.fang.com/SoufunFamily.htm']
    redis_key = "soufang:start_urls"

    def parse(self, response):
        # Each <tr> on the city-list page holds one province and its cities.
        trs = response.xpath("//div[@class='outCont']//tr")
        province = ''
        for tr in trs:
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath(".//text()").get()
            province_text = re.sub(r"\s", "", province_text or "")
            if province_text:
                # The province cell is only filled on the first row of each group.
                province = province_text
            if province == '其它':   # "other" (overseas) section, skip it
                continue
            city_td = tds[1]
            city_links = city_td.xpath(".//a")
            for city_link in city_links:
                city = city_link.xpath(".//text()").get()
                city_url = city_link.xpath(".//@href").get()
                # e.g. "https://sh.fang.com/" -> scheme "https:", domain "sh.fang.com/"
                url_module = city_url.split("//")
                scheme = url_module[0]
                domain = url_module[1]
                if 'bj.' in domain:
                    # Beijing does not follow the common pattern.
                    newhouse_url = 'https://newhouse.fang.com/house/s/'
                    esf_url = 'http://esf.fang.com/'
                else:
                    newhouse_url = scheme + '//' + 'newhouse.' + domain + 'house/s/'
                    esf_url = scheme + '//' + 'esf.' + domain
                yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                     meta={"info": (province, city)})
                yield scrapy.Request(url=esf_url, callback=self.parse_esf,
                                     meta={"info": (province, city)})
                # NOTE: the two breaks below limit the run to the first city of the
                # first row for testing; remove them to crawl every city.
                break
            break

    def parse_newhouse(self, response):
        # New-house list pages.
        province, city = response.meta.get('info')
        lis = response.xpath("//div[contains(@class, 'nl_con')]/ul/li")
        for li in lis:
            li_sect = li.xpath(".//div[@class='nlcd_name']/a/text()")
            if not li_sect:
                # Skip advertisement <li> blocks that have no project name.
                continue
            name = li_sect.get().strip()
            house_type = li.xpath(".//div[contains(@class, 'house_type')]/a/text()").getall()
            rooms = '/'.join([item.strip() for item in house_type if item.endswith('居')]) or '未知'
            ares = li.xpath("string(.//div[contains(@class, 'house_type')])").get()
            ares = ares.split('-')[1].strip() if '-' in ares else '未知'
            address = li.xpath(".//div[@class='address']/a/@title").get()
            address_info = li.xpath("string(.//div[@class='address'])").get()
            district_match = re.search(r'.*\[(.*)\].*', address_info)
            district = district_match.group(1) if district_match else '未知'
            sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
            price = li.xpath("string(.//div[@class='nhouse_price'])").get().strip()
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item = NewHouseItem(name=name, rooms=rooms, ares=ares, address=address,
                                district=district, sale=sale, price=price,
                                origin_url=origin_url, province=province, city=city)
            yield item
        next_url = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            print('Next page (new houses):', response.urljoin(next_url))
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
        else:
            print("No next page of new-house listings found")

    def parse_esf(self, response):
        # Second-hand-house (esf) list pages.
        province, city = response.meta.get('info')
        print(province, city)
        dls = response.xpath("//div[contains(@class, 'shop_list')]/dl")
        for dl in dls:
            name = dl.xpath(".//span[@class='tit_shop']/text()").get()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            rooms, floor, toward, ares, year = '未知', '未知', '未知', '未知', '未知'
            for info in infos:
                # The tel_shop paragraph mixes several fields; classify each piece by the
                # keyword it contains (厅 rooms, 层 floor, 向 orientation, ㎡ area, 建 year built).
                if '厅' in info:
                    rooms = info.strip()
                elif '层' in info:
                    floor = info
                elif '向' in info:
                    toward = info
                elif '㎡' in info:
                    ares = info
                elif '建' in info:
                    year = info
            address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            price = dl.xpath("string(.//dd[@class='price_right']/span[1])").get()
            unit = dl.xpath("string(.//dd[@class='price_right']/span[2])").get()
            detail_url = dl.xpath(".//p[@class='title']/a/@href").get()
            origin_url = response.urljoin(detail_url)
            item = ESFHouseItem(name=name, rooms=rooms, ares=ares, address=address,
                                toward=toward, floor=floor, price=price, origin_url=origin_url,
                                province=province, city=city, year=year, unit=unit)
            yield item
        next_url = None
        next_page_info = response.xpath("//div[@class='page_al']//p")
        for info in next_page_info:
            if info.xpath("./a/text()").get() == "下一页":   # link whose text is "next page"
                next_url = info.xpath("./a/@href").get()
                print(next_url)
        if next_url:
            print('Next page (second-hand houses):', response.urljoin(next_url))
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf,
                                 meta={"info": (province, city)})
        else:
            print("No next page of second-hand-house listings found")
I also added a request-header middleware; it shows two ways to obtain a random User-Agent (the second, faker-based variant is sketched after the code).
# -*- coding: utf-8 -*-
"""
middlewares.py
"""
import random

from faker import Factory
from scrapy import signals

f = Factory.create()


class UserAgentDownloadMiddleWare(object):
    # Downloader middleware that attaches a random User-Agent header to every request.
    USER_AGENTS = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360 browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENTS)
        # user_agent = f.user_agent()  # alternative: generate one with the faker library
        print(user_agent)
        request.headers['User-Agent'] = user_agent
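The commented-out line above is the second approach mentioned: instead of maintaining a list by hand, let the faker library generate a User-Agent for each request. A minimal sketch of that variant (the class name is mine; Faker() is the current equivalent of the older Factory.create() used above, and faker must be installed):

from faker import Faker


class FakerUserAgentDownloadMiddleWare(object):
    # Variant of the middleware above that generates a random User-Agent with faker.
    def __init__(self):
        self.faker = Faker()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.faker.user_agent()

Whichever variant you use, it has to be registered in DOWNLOADER_MIDDLEWARES, as shown in the settings below.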
Settings (settings.py):
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownloadMiddleWare': 543,
}

########## scrapy-redis settings ##########
# Use the scrapy-redis scheduler so requests are stored in Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Make sure all spider instances share the same dedup fingerprints.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Push scraped items into Redis.
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,
}
# Keep the queue and dedup set in Redis so the crawl can be paused and resumed.
SCHEDULER_PERSIST = True
REDIS_HOST = '127.0.0.1'   # Redis host
REDIS_PORT = 6379          # default Redis port
###########################################
3. Running the spider
Earlier, in the spider code, we defined a Redis key: redis_key = "soufang:start_urls". This key tells the spider where to read its start URL from.
- Go into the spiders directory and run the command scrapy runspider soufang.py. The spider starts but then blocks, listening for a start URL to appear under that key.
- So far I have only tested the crawl on Windows and the results look correct; strictly speaking, a distributed crawl only shows its real value when several machines crawl at the same time (so much for "distributed"...), but the goal here is to demonstrate the approach. Install Redis locally on Windows, start the server redis-server.exe and then the client redis-cli.exe, and push a start URL from the client with the command lpush soufang:start_urls https://www.fang.com/SoufunFamily.htm. Here soufang:start_urls is the key defined earlier in soufang.py. Press Enter, and the spider that was blocked a moment ago starts working.
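With the RedisPipeline configured above, scraped items also end up in Redis rather than on disk. If I read the scrapy-redis defaults correctly, they are pushed as JSON onto a list named after the spider, i.e. soufang:items here (check your Redis keys if yours differs). Below is a minimal sketch of a consumer script that drains that list into a JSON-lines file; the script and file name are my own additions, assuming the redis Python package and a local Redis instance:

import json

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)

with open('fang_items.jsonl', 'a', encoding='utf-8') as fp:
    while True:
        raw = r.lpop('soufang:items')   # assumed default scrapy-redis items key
        if raw is None:                 # list drained
            break
        item = json.loads(raw)
        fp.write(json.dumps(item, ensure_ascii=False) + '\n')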