Practicing a two-stage workflow
- Step 1: fetch the target URLs and store them in the database (the MongoDB helpers live in mongoconn.py)
- Step 2: read the URLs back from the database and extract the target information from each page (homework2_2.py)
Source code
mongoconn.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# MongoDB helpers: open a collection and bulk-insert documents
import pymongo

def mongoset(db, table):
    # Connect to the local MongoDB instance and return the requested collection
    client = pymongo.MongoClient('localhost', 27017)
    data = client[db]
    sheet = data[table]
    return sheet

def mongoinsert(table, data):
    # Insert a list of documents in one call
    table.insert_many(data)
homework2_2.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Scrape mobile phone number listings from 58.com
# step 1: collect all listing URLs and save them to the database
# step 2: fetch the detail info by visiting each stored URL
from bs4 import BeautifulSoup
import requests
import time
from mongoconn import mongoset, mongoinsert

def get_soup(url):
    # Download a page and parse it with lxml
    source = requests.get(url)
    soup = BeautifulSoup(source.text, 'lxml')
    return soup
def combineurls(url, page):
    # Build the list of paging URLs: url + '1/', url + '2/', ..., url + 'page/'
    pageurls = []
    for i in range(1, page + 1):
        pageurl = '{}{}/'.format(url, i)
        pageurls.append(pageurl)
    return pageurls

def get_page_urls(url):
    # Follow the pager until the reported last page stops growing,
    # then generate URLs for every page up to that maximum
    curpage = 1
    maxpage = 0
    while curpage > maxpage:
        maxpage = curpage
        pageurl = url + 'pn' + str(maxpage)
        soup = get_soup(pageurl)
        pager = soup.select('div.pager > a')
        pagenum = pager[len(pager) - 3].select('span')[0].get_text()  #### -3 is a temporary hack, needs rethinking
        curpage = int(pagenum)
    urls = combineurls(url + 'pn', maxpage)
    return urls
def listtodict(urls):
    # Wrap each URL in a dict so it can be stored as a MongoDB document
    datamany = []
    for itemurl in urls:
        data = {
            'itemurl': itemurl
        }
        datamany.append(data)
    return datamany

def get_item_urls(url):
    # Collect the href of every listing link on one result page
    soup = get_soup(url)
    itemlist = soup.select('div.boxlist > ul > li > a.t')
    itemurls = []
    for item in itemlist:
        try:
            itemurl = item.get('href')
            itemurls.append(itemurl)
        except Exception:
            pass
    time.sleep(1)
    return itemurls
def getemtext(element):
    # Extract an element's text with all whitespace stripped out
    return element.get_text().strip().replace('\t', '').replace('\n', '').replace(' ', '')

def get_target_info(url):
    # Scrape title and price from a single listing page; return None if the layout is missing
    soup = get_soup(url)
    main = soup.select('div.detailPrimary')
    if main:
        title = main[0].select('div.mainTitle h1')[0]
        price = main[0].select('span.price')[0]
        data = {
            'title': getemtext(title),
            'price': getemtext(price),
            'url': url
        }
        return data
if __name__ == '__main__':
    ### step1, get urls and insert into mongo
    table = mongoset('58sale', 'shoujihaourl')
    url = 'http://bj.58.com/shoujihao/'
    pageurls = get_page_urls(url)
    for url in pageurls:
        mongoinsert(table, listtodict(get_item_urls(url)))

    ### step2, read the stored urls and get detailed info
    table = mongoset('58sale', 'shoujihaourl')
    tinfo = mongoset('58sale', 'shoujihaoinfo')
    data = table.find()
    for item in data:
        info = get_target_info(item['itemurl'])
        if info:
            if not tinfo.count_documents({'url': item['itemurl']}):  # skip listings that were already scraped
                print(info)
                tinfo.insert_one(info)
        time.sleep(1)
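The #### comment in get_page_urls marks the fragile part: indexing the pager links with a fixed -3 offset. One possible alternative, sketched here under the assumption that the page numbers appear as digit-only span texts inside div.pager > a (not verified against every 58.com layout), is to take the largest numeric label instead:

def get_last_page(soup):
    # Sketch: collect every digit-only page label in the pager and take the max,
    # instead of relying on the fixed -3 offset used above
    pagenums = []
    for a in soup.select('div.pager > a'):
        for span in a.select('span'):
            text = span.get_text().strip()
            if text.isdigit():
                pagenums.append(int(text))
    return max(pagenums) if pagenums else 1

get_page_urls could then call get_last_page(soup) in place of the pager[len(pager) - 3] lookup.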
- URL handling is kept separate from the MongoDB-related operations; the Mongo helpers are pulled in with from mongoconn import mongoset, mongoinsert, as shown in the sketch below.
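A minimal standalone use of the two helpers looks like this (the 'testurl' collection name is only an example for illustration):

# Quick sanity check of the mongoconn helpers on their own
from mongoconn import mongoset, mongoinsert

sheet = mongoset('58sale', 'testurl')  # open (or create) the example collection
mongoinsert(sheet, [{'itemurl': 'http://bj.58.com/shoujihao/pn1/'}])
print(sheet.find_one())                # shows the document that was just inserted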
Results
- step1: the target URLs are stored in the 'shoujihaourl' collection
- step2: the extracted detail info is stored in the 'shoujihaoinfo' collection (see the inspection sketch below)
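Both collections can be checked directly with pymongo; a small sketch, reusing mongoset and the field names produced by listtodict and get_target_info above:

# Inspect what step 1 and step 2 wrote to MongoDB
from mongoconn import mongoset

urlsheet = mongoset('58sale', 'shoujihaourl')
infosheet = mongoset('58sale', 'shoujihaoinfo')

print(urlsheet.count_documents({}))  # how many listing URLs step 1 collected
print(urlsheet.find_one())           # a document of the form {'itemurl': ...}
print(infosheet.find_one())          # a document of the form {'title': ..., 'price': ..., 'url': ...}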
Summary
- Splitting the crawl into stages and persisting the information that has already been collected saves both time and effort.