1. Parsing the Douban Top 250 movie detail pages
The detail pages of the Douban Top 250 movies have been persisted as 250 html files. Download link for the archive: https://pan.baidu.com/s/1_zlZJQJtl9pPEJUGYVMYaw password: ehrq
After unpacking, name the folder doubanSourcePages. Copy the code below into a .py file that sits in the same directory as the doubanSourcePages folder.
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

def cssFind(movie, cssSelector, nth=1):
    # Return the stripped text of the nth element matching the CSS selector, or '' if absent
    if len(movie.select(cssSelector)) >= nth:
        return movie.select(cssSelector)[nth-1].text.strip()
    else:
        return ''

def reFind(pattern, sourceStr, nth=1):
    # Return the nth regex match in sourceStr, or '' if there are fewer than nth matches
    if len(re.findall(pattern, sourceStr)) >= nth:
        return re.findall(pattern, sourceStr)[nth-1]
    else:
        return ''

if __name__ == "__main__":
    movie_list = []
    for i in range(1, 251):
        print("Parsing the page of the movie ranked %d" % i)
        fileName = "doubanSourcePages/%03d.html" % i
        try:
            with open(fileName, encoding='utf8') as file:
                soup = bs(file, 'lxml')
            movie = {}
            movie['得分'] = cssFind(soup, 'strong[class="ll rating_num"]')    # rating
            movie['片名'] = cssFind(soup, 'span[property="v:itemreviewed"]')  # title
            info = cssFind(soup, "div[id='info']")
            # The info block holds lines such as "导演: ..."; split on the first colon
            # only, because values (e.g. official-site URLs) may themselves contain colons
            for item in info.split('\n')[:9]:
                if ':' not in item:
                    continue
                key, value = item.split(':', 1)
                movie[key.strip()] = value.strip()
            movie['标签'] = ','.join([k.text for k in soup.select("div.tags a")])  # tags
            movie['图片链接'] = soup.select('a.nbgnbg img')[0]['src']              # poster URL
            # Append only after every field succeeded, so a failure cannot
            # leave a half-filled entry plus the empty dict from the except branch
            movie_list.append(movie)
        except Exception:
            print("Failed to parse the page of the movie ranked %d" % i)
            movie_list.append({})
    # Column order follows the first parsed movie
    df = pd.DataFrame(movie_list, columns=movie_list[0].keys())
    df.to_excel("豆瓣电影详情信息.xlsx")
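The two helper functions exist to avoid an IndexError when a selector or pattern matches nothing. A minimal demonstration of cssFind, assuming it is in scope from the script above; the HTML fragment and the class name "title" are made up for illustration, not taken from Douban's markup:

from bs4 import BeautifulSoup as bs

html = '<div id="info"><span class="title">肖申克的救赎</span></div>'
soup = bs(html, 'lxml')
print(cssFind(soup, 'span[class="title"]'))         # prints: 肖申克的救赎
print(cssFind(soup, 'span[class="title"]', nth=2))  # no 2nd match, prints an empty string

reFind behaves the same way for regular expressions: it returns the nth result of re.findall, or an empty string when there are fewer than nth matches.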
2. Persisting the detail pages
The code is as follows:
from bs4 import BeautifulSoup as bs
import requests
import os
from time import sleep

# Douban may reject requests that carry no browser User-Agent, so send one
headers = {'User-Agent': 'Mozilla/5.0'}

def save_webPage(url, fileName):
    # Download the page, write it to disk, and return the HTML so the
    # caller does not have to fetch the same URL twice
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    with open(fileName, 'w', encoding='utf-8') as file:
        file.write(response.text)
    return response.text

if __name__ == "__main__":
    # Save each Top 250 list page, then every detail page it links to,
    # directly into the folder that the parser in section 1 reads from
    os.makedirs("doubanSourcePages", exist_ok=True)
    url_before = "https://movie.douban.com/top250?start={}"
    count = 0
    for i in range(0, 250, 25):
        url = url_before.format(i)
        fileName = "{}-{}.html".format(i+1, i+25)
        webPage = save_webPage(url, fileName)
        soup = bs(webPage, 'lxml')
        movie_list = soup.select("ol.grid_view li")
        for movie in movie_list:
            nextUrl = movie.select("div.hd a")[0]['href']
            count += 1
            fileName = "doubanSourcePages/%03d.html" % count
            print("Saving the detail page of the movie ranked %d" % count)
            save_webPage(nextUrl, fileName)
            sleep(3)  # pause between requests to stay polite to the server
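Downloads can fail silently (a timeout, a block page), so before running the parser from section 1 it helps to confirm that all 250 detail pages actually landed on disk. A small sketch, assuming the detail pages sit in the doubanSourcePages folder as above:

import os

missing = [i for i in range(1, 251)
           if not os.path.exists("doubanSourcePages/%03d.html" % i)]
if missing:
    print("Missing detail pages:", missing)
else:
    print("All 250 detail pages are present.")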
3. Randomly grouping people
import random

def getGroup(lt, n):
    # Split list lt into n random groups whose sizes differ by at most one.
    # Note: lt is consumed (emptied) in the process.
    lt_len = len(lt)
    left = lt_len % n   # how many people are left over after equal division
    m = lt_len // n     # base group size
    # n-left groups of size m, plus left groups of size m+1, in random order
    group_number_list = [m] * (n-left) + [m+1] * left
    random.shuffle(group_number_list)
    group_list = []
    print(group_number_list)
    for group_number in group_number_list:
        group = random.sample(lt, group_number)
        print(group)
        for i in group:
            lt.remove(i)
        group_list.append(group)
    return group_list

if __name__ == "__main__":
    name_str = "陶宇,王燕琪,雷杰,韦民童,余鹏,李波,雷坤,"\
        "石月,丁松,郑志杰,陶雨,程韶曦,葛振刚,王雪虎,李响,仲雯,王海宾"
    name_list = name_str.split(',')
    getGroup(name_list, 4)
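For the 17 names above with n=4: lt_len % n is 1 and lt_len // n is 4, so group_number_list becomes [4, 4, 4, 5] before shuffling, i.e. three groups of four plus one group of five. Because getGroup empties the list it receives, here is an alternative sketch (the name getGroupSlices is mine, not from the original) that shuffles a copy and slices it, leaving the caller's list intact:

import random

def getGroupSlices(lt, n):
    # Work on a shuffled copy so the input list is not consumed
    pool = lt[:]
    random.shuffle(pool)
    m, left = divmod(len(pool), n)
    groups, start = [], 0
    for k in range(n):
        size = m + 1 if k < left else m   # the first 'left' groups absorb the remainder
        groups.append(pool[start:start+size])
        start += size
    return groups

getGroupSlices(name_list, 4) yields the same group sizes while name_list keeps all 17 names.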