Python实战计划 第一周大作业-58同城
要求:
1.爬取页面http://bj.58.com/pbdn/0/ 的列表信息(除转转和推广信息外),然后获取详情信息,如:类目、标题、发帖时间、价格、成色、区域、浏览量等
2.注意浏览量的获取方法
执行结果:
代码如下:
from bs4 import BeautifulSoup
import requests
import time
# Listing page to crawl (Beijing second-hand tablets on 58.com).
url ='http://bj.58.com/pbdn/0/'
# Detail-page URLs collected by get_url().
url_links = []
# Parsed detail records appended by get_msginfo().
data = []
def _should_keep(href, is_promoted):
    """Return True when *href* points at a regular listing worth crawling.

    Promoted posts and zhuanzhuan (转转) resale links are filtered out.
    Membership testing (`in`) is used instead of `str.find() >= 0` /
    `str.index()`, which the original relied on.
    """
    if is_promoted:
        return False
    return 'zhuanzhuan' not in href


def get_url(url):
    """Collect detail-page links from a 58.com listing page.

    Hrefs of ordinary listings are appended to the module-level
    ``url_links`` list; promoted entries (marked with a ``data-addtype``
    attribute on the anchor) and zhuanzhuan resale links are skipped.

    :param url: listing-page URL to fetch.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    for link in soup.select('td.t a.t'):
        # Checking attribute presence directly avoids the original
        # bare ``except:`` that silently swallowed every error.
        is_promoted = 'data-addtype' in link.attrs
        href = link.attrs.get('href', '')
        if _should_keep(href, is_promoted):
            url_links.append(href)
def get_msginfo(url):
    """Fetch one 58.com detail page and append its parsed fields to ``data``.

    Extracted fields: category breadcrumb, title, post time, price,
    condition, area (``None`` when the page has no area node) and view
    count (fetched separately via :func:`get_view`).

    :param url: detail-page URL to fetch.
    """
    print("msginfo:" + url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Category breadcrumb, split into its path components.
    # (Renamed from ``type``, which shadowed the builtin.)
    category = soup.select('#header > div.breadCrumb.f12 ')[0].text.split()
    # Listing title.
    title = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')[0].text
    # Post time.
    post_time = soup.select('#index_show > ul.mtit_con_left.fl > li.time')[0].text
    # Price.
    price = soup.select('ul > li > div.su_con > span.price')[0].text
    # Condition ("成色").
    condition = soup.select('ul > li > div.su_con > span')[1].text.strip()
    # The area node is optional; some listings omit it entirely.
    area_nodes = soup.select('.c_25d')
    area = area_nodes[0].text.replace('-', '').split() if area_nodes else None
    # Record keys are kept unchanged for compatibility with consumers
    # of the module-level ``data`` list.
    record = {
        'type': category,
        'title': title,
        'ftime': post_time,
        'price': price,
        'purity': condition,
        'area': area,
        'view': get_view(url),
    }
    print(record)
    data.append(record)
def get_view(url):
    """Return the view count of a 58.com detail page as a string.

    The counter is rendered via JavaScript on the page itself, so it is
    fetched from the separate counter endpoint. 58.com validates the
    ``Referer`` header against the detail-page URL, so it must be sent.

    :param url: detail-page URL, e.g. ``http://bj.58.com/.../12345x.shtml``.
    """
    # The info id is the numeric tail of the detail URL
    # (".../<infoid>x.shtml"). ``split`` already returns str, so the
    # original redundant ``str(...)`` wrapper is dropped.
    info_id = url.split('x.shtml')[0].split('/')[-1]
    # Fixed typo: was ``viwe_url``.
    view_url = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    response = requests.get(view_url, headers={'Referer': url})
    # Body looks like "...=NNN"; the count is the part after the last '='.
    return response.text.split('=')[-1]
# Entry-point guard: running the crawl only when executed as a script
# makes the module importable (e.g. for reuse or testing) without
# triggering network requests at import time. Behavior when run
# directly is unchanged.
if __name__ == '__main__':
    get_url(url)
    for url_link in url_links:
        # Politeness delay between detail-page requests to avoid
        # being rate-limited or blocked.
        time.sleep(2)
        get_msginfo(url_link)
总结:
1.通过一周的学习,已熟练掌握requests、bs4库的使用,学会了网页要素的提取,能用多种方法进行数据筛选
2.通过大作业,学会了商品过滤、js异步加载的页面分析,及简单的反爬技巧,对http协议有了更多的认识