# ---- File: get_Href ----
from getPageInformation import *
from getMainPageInformation import *
from bs4 import BeautifulSoup
import requests
import time
# Entry page of the site; its listing links are collected first.
url1 = 'http://bj.xiaozhu.com/'

# Paginated search-result pages, numbered 2 through 19.
urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(page_no))
    for page_no in range(2, 20)
]

sourceData = []

# Gather every listing link: first from the entry page, then from each
# search-result page, waiting one second after every extra page request.
hreflist = MainPageInformation(url1)
for page_url in urls:
    hreflist.extend(MainPageInformation(page_url))
    time.sleep(1)

# Visit each listing page and keep whatever dict the detail scraper returns,
# again waiting one second after every request.
for listing_url in hreflist:
    sourceData.append(getPageInformation(listing_url))
    time.sleep(1)

# Dump the collected records, each followed by blank lines.
for record in sourceData:
    print(record)
    print('\n')
# ---- File: getMainPageInformation ----
from bs4 import BeautifulSoup
import requests
def MainPageInformation(url, timeout=10):
    """Download a listing index page and return the links found on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.

    Returns:
        A list with the ``href`` value of every
        ``ul > li > a[class="resule_img_a"]`` element on the page, in
        document order. Anchors without an ``href`` are skipped. The list is
        empty when nothing matches.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, timeout, invalid URL, ...).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    anchors = soup.select(' ul > li > a[class="resule_img_a"]')

    # Tag.get returns None for a missing attribute; drop those so callers
    # never receive a value that is not a usable link.
    hrefs = (anchor.get('href') for anchor in anchors)
    return [href for href in hrefs if href]
# ---- File: getPageInformation ----
from bs4 import BeautifulSoup
import requests
def getPageInformation(url, timeout=10):
    """Download one listing detail page and extract its headline fields.

    Six CSS selectors are run against the page and their matches are paired
    up positionally with ``zip``. The last complete group of matches is
    turned into the result.

    Args:
        url: Address of the detail page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the caller forever.

    Returns:
        A dict with the keys ``title``, ``roomImage``, ``price``,
        ``address``, ``hosterImage`` and ``hosterName``. Because ``zip``
        stops at the shortest input, an empty dict is returned as soon as
        any one of the six selectors matches nothing.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, timeout, invalid URL, ...).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    rows = list(zip(
        soup.select('div.pho_info > h4 > em'),
        soup.select('#curBigImage'),
        soup.select("div.day_l > span"),
        soup.select('div.pho_info > p > span.pr5'),
        soup.select('div.member_pic > a > img'),
        soup.select('div.w_240 > h6 > a'),
    ))
    if not rows:
        return {}

    # Only the last complete group is reported: earlier groups would simply
    # be overwritten if the dict were rebuilt once per group.
    title, room_image, price, address, hoster_image, hoster_name = rows[-1]
    return {
        'title': title.get_text(),
        'roomImage': room_image.get('src'),
        'price': price.get_text(),
        'address': address.get_text().strip(),
        'hosterImage': hoster_image.get('src'),
        'hosterName': hoster_name.get_text(),
    }
# print(titles)
# print("-------------------------------------------")
# print(roomImages)
# print("-------------------------------------------")
# print(price)
# print("-------------------------------------------")
# print(address)
# print("-------------------------------------------")
# print(hosterImage)
# url='http://bj.xiaozhu.com/fangzi/269024901.html'
# getPageInformation(url)