成果:
任务:
Level 1
爬取网页 http://sh.xiaozhu.com/fangzi/1650345535.html 中的以下信息:
1、标题————title;
2、地址————address;
3、日租金————rent;
4、第一张房源图片链接————housePic;
5、房东图片链接————landlordPic;
6、房东名字————landlordName;
7、房东性别————sex。
Level 2
从列表页 http://sh.xiaozhu.com/search-duanzufang-p1-0/ 开始,抓取 300 个房源的详情。
代码:
from bs4 import BeautifulSoup
import requests
def SexJudge(sex):
    """Translate a CSS class string into a gender label.

    Args:
        sex: A class-attribute string; GetInfo passes the class names of the
            landlord's icon element joined into a single string.

    Returns:
        'man' when the string is exactly 'member_ico', otherwise 'woman'.
    """
    # Any value other than the exact 'member_ico' marker is treated as female.
    return 'man' if sex == 'member_ico' else 'woman'
def GetInfo(houseUrl):
    """Download one listing page and extract its details.

    Args:
        houseUrl: URL of a single listing's detail page.

    Returns:
        A dict with the keys 'title', 'address', 'rent', 'housePic',
        'landlordPic' and 'sex', or None when any of the selected elements
        is missing from the page.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    wb_data = requests.get(houseUrl, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select('head > title')
    addresses = soup.select('span.pr5')
    rents = soup.select('#pricePart > div.day_l > span')
    housePics = soup.select('#curBigImage')
    landlordPics = soup.select(
        '#floatRightBox > div.js_box.clearfix > div.member_pic > a > img'
    )
    sexes = soup.select(
        '#floatRightBox > div.js_box.clearfix > div.member_pic > div'
    )

    # NOTE(review): no landlord-name field is extracted here, although the
    # task description at the top of the file lists one (landlordName).

    # zip() stops at the shortest list, so when any selector matched nothing
    # the loop body never runs; start from None so the return below is safe.
    data = None
    for title, address, rent, housePic, landlordPic, sex in zip(
            titles, addresses, rents, housePics, landlordPics, sexes):
        data = {
            'title': title.get_text(),
            'address': address.get_text(),
            'rent': rent.get_text(),
            'housePic': housePic.get('src'),
            'landlordPic': landlordPic.get('src'),
            # The class attribute is a list of names; join it into one string
            # before handing it to SexJudge.
            'sex': SexJudge(''.join(sex.get('class'))),
        }
        print(data)
    return data
# URLs of search-result pages 1 through 13 (the page number fills the `{}`).
menuUrl = list(
    map('http://sh.xiaozhu.com/search-duanzufang-p{}-0/'.format, range(1, 14))
)
def GetHouseUrl(menuUrl):
    """Collect listing detail-page links from each search-result page.

    Args:
        menuUrl: Iterable of search-result page URLs.

    Returns:
        A list of the `href` values of the listing links, in the order the
        pages were given and the links appear on each page.
    """
    data = []
    for n, url in enumerate(menuUrl, start=1):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        wb_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        houseUrls = soup.select('#page_list > ul > li > a')
        # Append at the end: insert(-1, ...) would place each new link before
        # the current last element and scramble the order.
        data.extend(houseUrl.get('href') for houseUrl in houseUrls)
        print('Complete Page ', n)
    return data
# Crawl every listing found on the search-result pages, printing a running
# count after each one.
n = 0  # stays 0 when no listing URLs were found
for n, houseUrl in enumerate(GetHouseUrl(menuUrl), start=1):
    GetInfo(houseUrl)
    print('Complete House ', n)