# A simple crawler: one fixed URL with regularly structured data; it writes each book's title, blurb, rating, and author/publisher info to a local file.
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import sys
# Python 2 legacy hack: restore the hidden setdefaultencoding() (deleted by
# site.py at startup) and switch the default str<->unicode codec from ASCII
# to UTF-8 so the Chinese text scraped below can be written to a plain file
# without explicit .encode() calls. This does not exist in Python 3, where
# it is unnecessary.
reload(sys)
sys.setdefaultencoding('utf8')
def get_latest_book():
    """Scrape Douban's 'latest books' page.

    Returns a list of dicts, one per book, each with the keys
    'bookName', 'bookRate', 'bookInfo' and 'bookDetail' (all text
    extracted from the page).

    Raises requests.RequestException on network failure or non-2xx
    HTTP status.
    """
    url = 'https://book.douban.com/latest?icn=index-latestbook-all'
    # timeout so a stalled connection cannot hang the script forever;
    # raise_for_status so an error page is not silently parsed as data
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    names = soup.select('.article .detail-frame a')
    rates = soup.select('.article .color-lightgray')
    infos = soup.select('.article .color-gray')
    details = soup.select('.article .detail')
    # zip stops at the shortest list, so unequal selector hit counts
    # (a page-layout quirk) can no longer raise IndexError as the old
    # range(len(...)) indexing could
    book_list = []
    for name, rate, info, detail in zip(names, rates, infos, details):
        book_list.append({
            'bookName': name.text,
            'bookRate': rate.text,
            'bookInfo': info.text,
            'bookDetail': detail.text,
        })
    return book_list
def write_file():
    """Fetch the latest-book list and append each record to a local text file.

    Each of the four fields is written on its own line so records do not
    run together. The file is created/truncated only after the fetch
    succeeds, so a network error cannot leave behind an empty file.
    """
    # fetch first: if this raises, the output file is never touched
    book_list = get_latest_book()
    # 'w' not 'w+': nothing is read back from the file
    with open('豆瓣新书速递.txt', 'w') as f:
        for book in book_list:
            f.write(book['bookName'])
            f.write('\n')
            f.write(book['bookRate'])
            f.write('\n')
            f.write(book['bookInfo'])
            f.write('\n')
            f.write(book['bookDetail'])
            f.write('\n')
    # parenthesized form works in both Python 2 and Python 3
    print('数据写入完毕')
# Guard the entry point so importing this module does not trigger a crawl.
if __name__ == '__main__':
    write_file()