爬取网址:https://music.douban.com/top250
爬取信息:歌曲名,表演者,流派,发行时间,出版者,评分
爬取方式:进入详细页面爬取,lxml,re解析。
存储方式:MongoDB存储
- 获取actor,style,publish_time,publisher字段时使用了正则表达式;相比通过定位标签来提取信息,正则表达式能更精确地匹配到目标内容,减少匹配错误。
- 使用语句
if len(publishers) == 0:
else:
来判断空信息。
import requests
from lxml import etree
import re
import pymongo
import time
def get_details_url(url, timeout=10):
    """Collect the detail-page links found on one listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the crawl forever.

    Returns:
        A list holding the ``href`` value of every ``<a class="nbg">``
        element on the page (empty when nothing matches).
    """
    # ``headers`` is the module-level request-header dict defined by the
    # script entry point.
    response = requests.get(url, headers=headers, timeout=timeout)
    page = etree.HTML(response.text)
    return page.xpath('//a[@class="nbg"]/@href')
def _first_match(pattern, text, default="未知"):
    """Return the first ``re.findall`` hit of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text, re.S)
    return found[0] if found else default


def get_info(url, timeout=10):
    """Scrape one detail page and store the extracted record in MongoDB.

    The title and the rating are read with XPath; performer, genre, release
    date and publisher are read with regular expressions over the raw HTML.
    Any regex field that is absent from the page is stored as "未知" so that
    one incomplete page does not abort the crawl.

    Args:
        url: Address of the detail page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the crawl forever.

    Raises:
        IndexError: If the page has no title or no rating element. These
            two lookups are deliberately left strict so that a page that is
            not a real detail page is not stored as an all-"未知" record.
    """
    # ``headers`` and ``topmusic`` are module-level names defined by the
    # script entry point.
    response = requests.get(url, headers=headers, timeout=timeout)
    page_text = response.text
    page = etree.HTML(page_text)

    name = page.xpath('//div[@id="wrapper"]/h1/span/text()')[0]
    score = page.xpath('//strong[@class="ll rating_num"]/text()')[0]

    # The performer is the text of the first link following the label.
    actor = _first_match("表演者:.*?>(.*?)</a>", page_text)
    # The remaining fields are plain text between the label and the next <br />.
    style = _first_match(r"流派:</span> (.*?)<br />", page_text).strip()
    publish_time = _first_match(r"发行时间:</span> (.*?)<br />", page_text).strip()
    publisher = _first_match(r"出版者:</span> (.*?)<br />", page_text).strip()

    info = {
        '歌曲名': name,
        '表演者': actor,
        '流派': style,
        '发行时间': publish_time,
        '出版者': publisher,
        '评分': score,
    }
    # Persist the record in the MongoDB collection.
    topmusic.insert_one(info)
if __name__ == "__main__":
    # Connect to the local MongoDB server and select the database and
    # collection (created if they do not exist yet).
    client = pymongo.MongoClient('localhost', 27017)
    mydb = client['mydb']
    topmusic = mydb['topmusic']  # module-level name read by get_info()
    # Browser-like User-Agent sent with every request; read as a global by
    # get_details_url() and get_info().
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
    # Ten listing pages, 25 entries each: start=0, 25, ..., 225.
    url_list = ['https://music.douban.com/top250?start={}'.format(offset) for offset in range(0, 250, 25)]
    try:
        for url in url_list:
            song_urls = get_details_url(url)
            for song_url in song_urls:
                get_info(song_url)
                # Pause after every detail-page request to throttle the crawl.
                time.sleep(2)
    finally:
        # Release the MongoDB connection even if the crawl stops on an error.
        client.close()