This assignment can't be called completely finished, but what remains is just debugging details of the site.
The code comes first.
Code that collects all the second-hand goods category links (crawler 1):
import pymongo
import requests
from bs4 import BeautifulSoup

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
listAddress = ganji['listAddress']

def get_one_link(url):
    wb_data = requests.get(url)
    wb_data.encoding = 'utf-8'
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # 404 detection is disabled for now; a 404 page could be recognised by the
    # //j1.58cdn.com.cn/js/404/topbar404.js script in its <head>:
    # if soup.select("head > script")[0].get('src').find("//j1.58cdn.com.cn/js/404/topbar404.js") > 0:
    #     return 0
    # titles and links come from the same <a> tags in the category sidebar
    titles = soup.select("#wrapper > div.layout > div.side-area > div.main > ul > li > div > dl > dd > a")
    links = soup.select("#wrapper > div.layout > div.side-area > div.main > ul > li > div > dl > dd > a")
    datas = []
    for title, link in zip(titles, links):
        data = {
            "title": title.text,
            "link": "http://bj.ganji.com" + link.get('href')
        }
        datas.append(data)
        listAddress.insert_one(data)
    return datas

get_one_link("http://bj.ganji.com/wu/")
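The category links that get_one_link() writes into listAddress can then be read straight back out of MongoDB; a minimal sketch, using the same local database as above:

# Dump every collected category link from the listAddress collection.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
listAddress = client['ganji']['listAddress']
for item in listAddress.find():
    print(item['link'])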
The resulting links are listed out directly, as the mlinks string that crawler 2 uses:
mlinks='''
http://bj.ganji.com/ershoubijibendiannao/_macbook+pro/
http://bj.ganji.com/ershoubijibendiannao/_macbook/
http://bj.ganji.com/iphone/
http://bj.ganji.com/ipodTouch/
http://bj.ganji.com/iphone-iphone-4s/
http://bj.ganji.com/mi-hongmi/
http://bj.ganji.com/sanxingshouji-galaxy-s-iv/
http://bj.ganji.com/sanxingshouji-galaxy-note-iii/
http://bj.ganji.com/pingguo/
http://bj.ganji.com/lianxiang/
http://bj.ganji.com/thinkpad/
http://bj.ganji.com/suoni/
http://bj.ganji.com/daier/
http://bj.ganji.com/huashuo/
http://bj.ganji.com/ershoubijibendiannao/_New+iPad/
http://bj.ganji.com/ershoubijibendiannao/_%E4%B9%90Pad/
http://bj.ganji.com/psv/
http://bj.ganji.com/shuma/_%E4%BD%B3%E8%83%BD/
http://bj.ganji.com/shuma/_%E5%B0%BC%E5%BA%B7/
http://bj.ganji.com/shuangrenchuang/
http://bj.ganji.com/dianfengshan/
http://bj.ganji.com/tongche/
http://bj.ganji.com/qunzi/
http://bj.ganji.com/fangshaishuang/
http://bj.ganji.com/iphone/
http://bj.ganji.com/nokia/
http://bj.ganji.com/htc/
http://bj.ganji.com/sanxingshouji/
http://bj.ganji.com/motorola/
http://bj.ganji.com/suoniailixin/
http://bj.ganji.com/shouji/p1/
http://bj.ganji.com/shouji/b1000e1500/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/tongxuntaocan/
http://bj.ganji.com/qqhao/
http://bj.ganji.com/lanyaerji/
http://bj.ganji.com/shoujike/
http://bj.ganji.com/shoujidianchi/
http://bj.ganji.com/chongdianqi/
http://bj.ganji.com/pingbandiannao/z1/
http://bj.ganji.com/bijibendiannao/
http://bj.ganji.com/shangwangben/
http://bj.ganji.com/ershoubijibendiannao/_%E6%B8%B8%E6%88%8F%E6%9C%BA/
http://bj.ganji.com/ershoubijibendiannao/_%E5%95%86%E5%8A%A1%E6%9C%AC/
http://bj.ganji.com/bijibendiannao/b0e500/
http://bj.ganji.com/bijibendiannao/p3/
http://bj.ganji.com/ershoubijibendiannao/p4/
http://bj.ganji.com/taishidiannaozhengji/
http://bj.ganji.com/yitiji/
http://bj.ganji.com/fuwuqi/
http://bj.ganji.com/xianka/
http://bj.ganji.com/cpu/
http://bj.ganji.com/yingpan/
http://bj.ganji.com/xianshiqi/
http://bj.ganji.com/neicun/
http://bj.ganji.com/zhuban/
http://bj.ganji.com/wuxianluyouqi/
http://bj.ganji.com/yidongyingpan/
http://bj.ganji.com/diannaoyinxiang/
http://bj.ganji.com/dayinji/
http://bj.ganji.com/3gwangka/
http://bj.ganji.com/danfanxiangji/
http://bj.ganji.com/dandianxiangji/
http://bj.ganji.com/jingtou/
http://bj.ganji.com/shumashexiangji/
http://bj.ganji.com/yueqiyinxiang/
http://bj.ganji.com/ipodTouch/
http://bj.ganji.com/psp/
http://bj.ganji.com/ps3/
http://bj.ganji.com/zhanghaozhuangbei/
http://bj.ganji.com/chongzhidianka/
http://bj.ganji.com/qqhao/z1/
http://bj.ganji.com/chuangdian/
http://bj.ganji.com/guizi/
http://bj.ganji.com/zhuoyi/
http://bj.ganji.com/shafachaji/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/bangongjiaju/
http://bj.ganji.com/jiaju/_%E6%90%AC%E5%AE%B6/
http://bj.ganji.com/jiaju/p1/
http://bj.ganji.com/dianshi/
http://bj.ganji.com/bingxiang/
http://bj.ganji.com/kongtiao/
http://bj.ganji.com/reshuiqi/
http://bj.ganji.com/xiyiji/
http://bj.ganji.com/diancilu/
http://bj.ganji.com/weibolu/
http://bj.ganji.com/doujiangji/
http://bj.ganji.com/yueqiyinxiang/
http://bj.ganji.com/zixingchemaimai/
http://bj.ganji.com/diandongche/
http://bj.ganji.com/motuoche/
http://bj.ganji.com/sanlunche/
http://bj.ganji.com/anmobaojian/
http://bj.ganji.com/chuangshangyongpin/
http://bj.ganji.com/zhuangshibaishe/
http://bj.ganji.com/yingerche/
http://bj.ganji.com/niuniuche/
http://bj.ganji.com/xuebuche/
http://bj.ganji.com/ertonganquanzuoyi/
http://bj.ganji.com/yingerchuang/z1/
http://bj.ganji.com/niaobushi/
http://bj.ganji.com/naifen/
http://bj.ganji.com/tongche/
http://bj.ganji.com/tongzhuang/
http://bj.ganji.com/wanju/
http://bj.ganji.com/qunzi/
http://bj.ganji.com/gaogenxie/
http://bj.ganji.com/liangxie/
http://bj.ganji.com/shoubiao/
http://bj.ganji.com/shipin/
http://bj.ganji.com/lvxingxiang/
http://bj.ganji.com/danjianbao/
http://bj.ganji.com/shuangjianbao/
http://bj.ganji.com/shoutibao/
http://bj.ganji.com/xiangshui/
http://bj.ganji.com/fangshaishuang/
http://bj.ganji.com/huazhuangpin/
http://bj.ganji.com/hufupin/
http://bj.ganji.com/paobuji/
http://bj.ganji.com/yaling/
http://bj.ganji.com/yumaoqiuqicai/
http://bj.ganji.com/shoufuji/z1/
http://bj.ganji.com/jita/
http://bj.ganji.com/gangqin/
http://bj.ganji.com/dianziqin/
http://bj.ganji.com/zhuanyejishushuji/
http://bj.ganji.com/kaoshijiaofu/
http://bj.ganji.com/xiaoshuowenxue/
http://bj.ganji.com/jiguangyitiji/
http://bj.ganji.com/dayinji/
http://bj.ganji.com/bangongzhuo/
http://bj.ganji.com/diannaozhuo/
http://bj.ganji.com/huojiazhanjia/z1/
http://bj.ganji.com/jichuang/
http://bj.ganji.com/fengrenji/
http://bj.ganji.com/fadianji/
http://bj.ganji.com/shipinjiagongshebei/
http://bj.ganji.com/gongjuwujin/
http://bj.ganji.com/nongyongjixie/
http://bj.ganji.com/tuolaji/
http://bj.ganji.com/shucaishuiguo/
http://bj.ganji.com/miaomu/
http://bj.ganji.com/qitalipinzhuanrang/
http://bj.ganji.com/kalei/
http://bj.ganji.com/shoubiao/
http://bj.ganji.com/jiu/
http://bj.ganji.com/chaye/
http://bj.ganji.com/shougongshizixiu/
http://bj.ganji.com/youpiao/
http://bj.ganji.com/guwan/
http://bj.ganji.com/jinyinzhuyu/
http://bj.ganji.com/gongyipin/
http://bj.ganji.com/dongchongxiacao/
http://bj.ganji.com/kafeizhuanrang/
http://bj.ganji.com/haishen/
http://bj.ganji.com/rencan/
http://bj.ganji.com/fengmi/
http://bj.ganji.com/anmobaojian/z1/
http://bj.ganji.com/zibubaojian/z2/
http://bj.ganji.com/bawanwujian/
http://bj.ganji.com/zangao/
http://bj.ganji.com/taidixiong/
http://bj.ganji.com/jinmao/
http://bj.ganji.com/hashiqi/
http://bj.ganji.com/dog/
http://bj.ganji.com/chongwupeizhong/
http://bj.ganji.com/mao/
http://bj.ganji.com/longmao/
http://bj.ganji.com/jinli/
http://bj.ganji.com/yingwu/
http://bj.ganji.com/cangshu/
http://bj.ganji.com/qitaxiaochong/_%E5%85%94%E5%AD%90/
http://bj.ganji.com/gouwuka/
http://bj.ganji.com/daijinquan/
http://bj.ganji.com/jianshenka/
http://bj.ganji.com/youyongka/
http://bj.ganji.com/dianyingpiao/
http://bj.ganji.com/jingdianmenpiao/
http://bj.ganji.com/yanchanghuimenpiao/
http://bj.ganji.com/tiyusaishi/
'''
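Crawler 2 below turns each of these category URLs into paginated list pages by appending "o" plus a page number; a quick illustration:

# How one category URL from the list above becomes paginated list-page URLs
# (the same "o" + page-number scheme that get_item_main uses below).
category = "http://bj.ganji.com/iphone/"
page_urls = [category + "o" + str(n) + "/" for n in range(1, 4)]
print(page_urls)
# ['http://bj.ganji.com/iphone/o1/', 'http://bj.ganji.com/iphone/o2/', 'http://bj.ganji.com/iphone/o3/']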
Code that fetches each item's link and detail page (crawler 2):
get_item_main is the entry point: it pages through one category with get_item_nums and get_item_link, and get_item_link in turn calls get_item_info for every listing it finds.
from get_list_address import mlinks, mtitles
from multiprocessing import Pool
import pymongo
import requests
from bs4 import BeautifulSoup

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
listAddress = ganji['listAddress']
sechandinfomation = ganji['sechandinfomation']
listAddress_complete = ganji['listAddress_complete']

def get_item_info(url):
    # scrape the detail page of a single listing
    wb_data_ori = requests.get(url)
    if wb_data_ori.status_code == 404:
        return None
    try:
        soup = BeautifulSoup(wb_data_ori.text, 'lxml')
        time = soup.select("#wrapper > div.content.clearfix > div.leftBox > div.col-cont.title-box > div > ul.title-info-l.clearfix > li > i")[0].text.strip().replace("\n", "").replace("\xa0", "")
        type = soup.select("#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(3) > div > ul > li > span")[0].text.strip().replace(" ", "").replace("\n", "").replace("\xa0", "")
        price = soup.select("#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(2) > i.f22.fc-orange.f-type")[0].text.strip()
        transaction_place = soup.select("#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(3)")[0].text.strip().replace(" ", "").replace("\n", "").replace("\xa0", "")[5:]
        # the condition field (新旧程度) is not always present, so it is left out for now:
        # condition = soup.select("#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(4) > div.det-summary > div > div:nth-of-type(1) > ul.second-det-infor.clearfix > li:nth-of-type(1)")[0].text.strip().replace(" ", "").replace("\n", "").replace("\xa0", "")[5:] if soup.find_all("新旧程度") else None
        data = {
            "time": time,
            "type": type,
            "price": price,
            "transaction_place": transaction_place,
            # "condition": condition,
            "link": wb_data_ori.url
        }
        return data
    except Exception as err:
        print(err)
        print(url)

def get_item_link(url, item):
    # scrape one list page: collect every listing link plus its details
    try:
        wb_data_ori = requests.get(url)
        soup = BeautifulSoup(wb_data_ori.text, 'lxml')
        datas = []
        hrefs = soup.select("#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a")
        for href in hrefs:
            item_data = get_item_info(href.get('href'))
            data = {
                "title": href.text.strip(),
                "link": href.get('href'),
                "time": item_data["time"],
                "type": item_data["type"],
                "price": item_data["price"],
                "transaction_place": item_data["transaction_place"],
                # "condition": item_data["condition"],
            }
            datas.append(data)
            print(data)
            sechandinfomation.insert_one(data)
        return datas
    except Exception as err:
        print(err)
        print("url=" + url)

def get_item_nums(url, item):
    # number of listings on one list page (a full page has 60 or more)
    wb_data_ori = requests.get(url)
    soup = BeautifulSoup(wb_data_ori.text, 'lxml')
    return len(soup.select("#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a"))

# Earlier version: loop over every category stored in listAddress instead of
# taking a single url (replaced by the multiprocessing-friendly version below).
# def get_item_main():
#     for item in listAddress.find():
#         n = 1
#         while True:
#             if get_item_nums(item['link'] + "o" + str(n) + "/", item['title']) >= 60:
#                 get_item_link(item['link'] + "o" + str(n) + "/", item['title'])
#                 n += 1
#             else:
#                 get_item_link(item['link'] + "o" + str(n) + "/", item['title'])
#                 break

def get_item_main(url):
    item = url[20:][:-1]  # item is not actually used, but kept so the code keeps the shape I had in mind
    n = 1
    while True:
        if get_item_nums(url + "o" + str(n) + "/", item) >= 60:  # build the paginated link by appending o + page number
            get_item_link(url + "o" + str(n) + "/", item)
            n += 1
        else:  # last, partially filled page
            get_item_link(url + "o" + str(n) + "/", item)
            break
    # mark this category as finished so the crawl can resume later
    data = listAddress.find_one({"link": url})
    listAddress_complete.insert_one(data)
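Before handing every category to the process pool, the entry point can be tried on a single category; a quick sketch (the URL is just one entry from mlinks, and the module name matches the import used by the runner script below):

# Run crawler 2 on a single category as a sanity check.
from get_item_link import get_item_main

if __name__ == '__main__':
    get_item_main("http://bj.ganji.com/iphone/")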
The script that runs the crawl across multiple processes:
How progress is recorded: the categories that are already finished are saved (in the listAddress_complete collection), so subtracting the finished categories from the full set of categories gives the ones that still need to be crawled.
from multiprocessing import Pool
import get_item_link
import get_list_address
from get_list_address import mlinks, mtitles

# breakpoint resume: all category links minus the ones already finished
db_urls = [item['link'] for item in get_item_link.listAddress.find()]
index_urls = [item['link'] for item in get_item_link.listAddress_complete.find()]
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y

if __name__ == '__main__':
    pool = Pool(processes=6)
    pool.map(get_item_link.get_item_main, rest_of_urls)  # only crawl the unfinished categories
    pool.close()
    pool.join()
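To see how far the crawl has gotten, the two bookkeeping collections can be compared directly; a minimal sketch against the same local MongoDB:

# Progress check: categories finished vs. categories still pending.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
total = {item['link'] for item in ganji['listAddress'].find()}
done = {item['link'] for item in ganji['listAddress_complete'].find()}
print("%d of %d categories finished, %d left" % (len(done), len(total), len(total - done)))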
All in all, the approach seems to be more or less the same as the instructor's.
There are still a few small bugs I didn't have time to fix, which is a bit of a pity.