爬取赶集网二手交易市场所有类目,并将信息储存在数据库中。
效果是这样的:
我的代码:
#建立channel_list.py文件获取所有类目的访问链接
import requests
from bs4 import BeautifulSoup
start_url = 'http://bj.ganji.com/wu/'


def get_channel_list(url):
    """Fetch the second-hand market index page and collect every category URL.

    Parameters:
        url: the market index page (e.g. http://bj.ganji.com/wu/).
    Returns:
        list of absolute category URLs (also printed, one per line).
    """
    # Hoisted out of the loop — the original rebound it every iteration.
    base_url = 'http://bj.ganji.com'
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    channels = soup.select('dl.fenlei dt a')  # one <a> per top-level category
    full_urls = []
    for channel in channels:
        full_url = base_url + channel.get('href')  # hrefs are site-relative
        full_urls.append(full_url)
        print(full_url)
    # Returning the list (instead of only printing) lets callers reuse it.
    return full_urls


get_channel_list(start_url)
# Frozen snapshot of the URLs printed by get_channel_list(), kept as a
# multi-line string so later stages don't need to re-fetch the index page.
# Consumed via channel_list.split() in main.py.
channel_list='''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
#建立link_list_detail_info.py文件获取每个类目的所有链接存放入数据库'linklists'及将每个类目的具体产品信息存放在'detailinfo'
import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
import random
# MongoDB connection: database 'ganjiDB' with two collections —
# 'linklists' (item detail-page URLs) and 'detailinfo' (scraped item details).
client=MongoClient('localhost',27017)
ganjiDB=client['ganjiDB']
linklists=ganjiDB['linklists']
detailinfo=ganjiDB['detailinfo']
# Desktop Chrome User-Agent so requests look like a normal browser.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
# Small pool of HTTP proxies; one is picked at random per run.
proxy_list=[
'http://125.88.74.122:83',
'http://113.18.193.5:8080',
'http://113.18.193.7:8080',
'http://120.92.3.127:90'
]
proxy_ip=random.choice(proxy_list)
# Intended for requests.get(..., proxies=proxies) to dodge Ganji's per-IP
# rate limiting.  NOTE(review): verify every fetch actually passes it.
proxies={'http':proxy_ip}
def page_link(channel):
    """Enumerate the listing pages of one category and scrape each of them.

    URL pattern: <channel>a<cate>o<page>, where cate 1 = personal sellers,
    cate 2 = merchants, and pages 1-100 are attempted for each.

    Parameters:
        channel: absolute category URL ending with '/'.
    """
    for cate in range(1, 3):
        for page in range(1, 101):
            # Fixed: the original wrapped format() in a one-element list and
            # indexed [0] — a pointless detour; build the URL directly.
            link_url = '{}a{}o{}'.format(channel, cate, page)
            link_list(link_url)
def link_list(url):
    """Scrape one listing page and store each item's detail-page URL in MongoDB.

    Personal pages ('...a1o<n>') and merchant pages ('...a2o<n>') use
    different markup, so each needs its own CSS selector.  A page with no
    'next' link is treated as past the end (or invalid) and skipped.

    Parameters:
        url: one listing-page URL produced by page_link().
    """
    time.sleep(2)  # throttle to stay under the site's rate limit
    # Fixed: the module-level proxy pool was declared but never passed to
    # requests, so the per-IP-limit workaround was inert.  Pass it here.
    web_data = requests.get(url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(web_data.text, 'lxml')
    has_next = soup.find('a', 'next')        # absent on the last page
    cate_flag = url.split('/')[-1][1]        # '1' personal, '2' merchant
    if has_next and cate_flag == '1':
        # Personal listings: item anchors live inside table cells.
        for item in soup.select('td.t a.t'):  # renamed: original shadowed builtin `list`
            list_href = item.get('href').split('?')[0]  # strip tracking query string
            linklists.insert_one({'list_href': list_href})
            print(list_href)
    elif has_next and cate_flag == '2':
        # Merchant listings use a different anchor class; href kept verbatim.
        for item in soup.select('a.ft-tit'):
            list_href = item.get('href')
            linklists.insert_one({'list_href': list_href})
            print(list_href)
    else:
        print('列表地址错误')
#获取每个页面的具体信息
def get_detail_info(url):
    """Scrape one item detail page and store its fields in `detailinfo`.

    Two page layouts are distinguished by a character probe on the URL:
    url[-5] == 'x' vs url[-7] == 'z' — presumably Ganji's own pages vs
    zhuanzhuan-hosted ones; TODO confirm against live URLs.  Anything
    else is reported as an invalid address.

    Parameters:
        url: one detail-page URL taken from the `linklists` collection.
    """
    # Fixed: pass the module-level proxy pool (declared but previously
    # unused), consistent with link_list().
    web_data = requests.get(url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(web_data.text, 'lxml')
    if url[-5] == 'x':
        info = {
            'title': soup.select('h1.title-name')[0].text,
            'date': soup.select('i.pr-5')[0].text.strip(),
            'types': soup.select('ul > li > span > a')[5].text,
            'price': soup.select('i.f22.fc-orange.f-type')[0].text,
            # The trailing breadcrumb anchors hold the district/area names.
            'area': list(map(lambda x: x.text, soup.select('div > div > div > div > ul > li > a')[-3:-1])),
            'url': url
        }
        detailinfo.insert_one(info)
        print(info)
    elif url[-7] == 'z':
        info = {
            'title': soup.select('h1.info_titile')[0].text,
            'price': soup.select('span.price_now i')[0].text,
            'area': soup.select('div.palce_li span i')[0].text,
            'url': url
        }
        detailinfo.insert_one(info)
        print(info)
    else:
        print('地址错误')
#建立main.py文件调用channel_list.py、link_list_detail_info.py中的属性和方法及数据库信息
from channel_list import channel_list
from link_list_detail_info import linklists,page_link,link_list
from link_list_detail_info import detailinfo,get_detail_info
from multiprocessing import Pool
import time
def get_all_links(channel):
    """Pool worker: crawl every listing page of one category.

    Side effect: fills the `linklists` collection via page_link/link_list.
    """
    page_link(channel)


if __name__ == '__main__':
    pool = Pool()
    # Phase 1: collect every item link for every category.
    pool.map(get_all_links, channel_list.split())
    time.sleep(10)
    # Phase 2 (resume support): fetch only the detail pages not yet stored.
    # Fixed: the original computed this set at module import time — i.e.
    # BEFORE phase 1 had scraped anything in this run — so the resume set
    # was stale.  Compute it after phase 1 completes.
    db_urls = set(item['list_href'] for item in linklists.find())
    index_urls = set(item['url'] for item in detailinfo.find())
    rest_of_url = db_urls - index_urls
    pool.map(get_detail_info, rest_of_url)
#建立count.py文件实时监控存入linklists中链接数量
from link_list_detail_info import linklists
import time
# Poll the link collection every 10 s to watch the crawl's progress.
while True:
    # Fixed: Cursor.count() is deprecated (removed in pymongo 4);
    # Collection.count_documents({}) is the supported equivalent.
    print(linklists.count_documents({}))
    time.sleep(10)
监控截图:
总结:
- MongoDB数据库基础功能的使用;
- 多进程访问方式的引用;
- 数据库查找的灵活调用实现断点续传;
- map、lambda函数的使用;
- proxy及headers防爬机制的使用。