1. Getting the Celebrity Name List
Install the requests module:
(base) C:\Users\Administrator>pip3 install requests
Install icrawler:
(base) C:\Users\Administrator>pip3 install icrawler
First, search for “明星” (celebrity) in the Baidu search bar; a celebrity panel appears, with region filters for the mainland, Hong Kong, Taiwan, South Korea and Japan, as shown in the figure below:
The following Python crawler scrapes these celebrity names:
import json
import os
import urllib.request

import requests


def getManyPages(pages):
    params = []
    # Each request returns one page of 12 entries ('rn': 12); 'pn' is the offset.
    for i in range(0, 12 * pages + 12, 12):
        params.append({
            'resource_id': 28266,
            'from_mid': 1,
            'format': 'json',
            'ie': 'utf-8',
            'oe': 'utf-8',
            'query': '明星',
            'sort_key': '',
            'sort_type': 1,
            'stat0': '',
            'stat1': '内地',
            'stat2': '',
            'stat3': '',
            'pn': i,
            'rn': 12
        })
    url = 'https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php'
    x = 0
    file = open('starName.txt', 'w', encoding='utf-8')
    paths = 'downloadFace/'
    os.makedirs(paths, exist_ok=True)  # make sure the download directory exists
    for param in params:
        try:
            response = requests.get(url, params=param)
            response.encoding = 'utf-8'
            js = json.loads(response.text)
            results = js.get('data')[0].get('result')
        except (AttributeError, IndexError, TypeError) as e:
            print(e)
            continue
        if not results:
            continue
        for result in results:
            img_name = result['ename']
            img_url = result['pic_4n_78']
            file.write('downloadFace/' + img_name + '\n')
            # Download the image at img_url and save it locally;
            # format() builds the file name from path, name and counter.
            urllib.request.urlretrieve(img_url, '{0}{1}.jpg'.format(paths, img_name + str(x)))
            x = x + 1
            if x % 10 == 0:
                print('%d images downloaded......' % x)
    file.close()


if __name__ == '__main__':
    # 10 is the number of pages to fetch
    getManyPages(10)
Note: in params, setting 'query': '台湾明星' together with 'stat1': '台湾' returns Taiwanese celebrities; changing these two values the same way yields the '内地' (mainland), '香港' (Hong Kong), '韩国' (South Korea) and other regions. As Figure 1 shows, each page holds 12 celebrities, so getManyPages(400) would fetch 400 pages of results, i.e. 12 × 400 = 4800 names; raise the page count to collect more. The scraped names are saved to a text file, which is needed in the later steps and also spares you from re-crawling everything if the script is interrupted.
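For example, a minimal sketch of fetching every region in one run, assuming getManyPages is refactored into a hypothetical getManyPages(pages, region) that substitutes the region into 'query' and 'stat1' (the version above hard-codes '明星' and '内地'):

# A sketch using a hypothetical refactor getManyPages(pages, region), in which
# every request dict sets 'query': region + '明星' and 'stat1': region instead
# of the hard-coded values above.
REGIONS = ['内地', '香港', '台湾', '韩国', '日本']

for region in REGIONS:
    getManyPages(10, region)  # 10 pages x 12 entries = 120 names per region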
Common errors:
- The ssl module cannot be imported
  Download and install Win64OpenSSL from:
  https://slproweb.com/products/Win32OpenSSL.html
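After installing, a quick check that the ssl module imports correctly:

# If this prints an OpenSSL version string, the ssl module is available.
import ssl
print(ssl.OPENSSL_VERSION)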
2. Crawling Images with icrawler
Create the file: KoreaStarName.txt
Add its content: the celebrity names to crawl, one per line.
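If you would rather reuse the list scraped in Part 1 than type names by hand, a small sketch (assuming starName.txt still carries the 'downloadFace/' prefix written by the Part 1 script) to convert it:

# Convert starName.txt from Part 1 (each line 'downloadFace/<name>')
# into a plain one-name-per-line KoreaStarName.txt.
with open('starName.txt', 'r', encoding='utf-8') as src, \
        open('KoreaStarName.txt', 'w', encoding='utf-8') as dst:
    for line in src:
        name = line.strip().replace('downloadFace/', '', 1)
        if name:
            dst.write(name + '\n')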
import os

from icrawler.builtin import BingImageCrawler

path = r'BingImage/'
# Open the name list for reading
file = open('KoreaStarName.txt', 'r', encoding='utf-8')
# One name per line
lines = file.readlines()
file.close()
for i, line in enumerate(lines):
    name = line.strip('\n')
    # One sub-directory per celebrity
    file_path = os.path.join(path, name)
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # Tell the crawler where to store the downloaded images
    bing_storage = {'root_dir': file_path}
    # Crawl Bing image search for this name, saving up to 10 images
    bing_crawler = BingImageCrawler(parser_threads=2, downloader_threads=4,
                                    storage=bing_storage)
    bing_crawler.crawl(keyword=name, max_num=10)
    print('Celebrity No.{}: {}'.format(i, name))
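icrawler ships several built-in crawlers with the same interface, so switching search engines only changes the class name; a sketch (the keyword '刘德华' is just an example):

# icrawler's BaiduImageCrawler and GoogleImageCrawler take the same
# constructor and crawl() arguments as BingImageCrawler above.
from icrawler.builtin import BaiduImageCrawler

baidu_crawler = BaiduImageCrawler(parser_threads=2, downloader_threads=4,
                                  storage={'root_dir': 'BaiduImage/刘德华'})
baidu_crawler.crawl(keyword='刘德华', max_num=10)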
3. Data Cleaning
This step cleans out non-face images: the crawled results may include pictures with no person in them, and those need to be deleted. Each image is checked for a face to decide whether to keep it; the face_recognition library does the face detection, and new_path is the directory the cleaned face images are saved into.
Install it:
(base) C:\Users\Administrator>pip3 install face_recognition
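Before cleaning whole directories, it helps to sanity-check the library on a single image (the path below is a placeholder):

# face_locations returns one (top, right, bottom, left) box per detected
# face; an empty list means no face was found.
import face_recognition

image = face_recognition.load_image_file('BingImage/test.jpg')  # placeholder path
print(face_recognition.face_locations(image))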
import os
import threading

# (base) C:\Users\Administrator>pip3 install face_recognition
import face_recognition
from PIL import Image, ImageFile

# Let Pillow load truncated downloads instead of raising an error
ImageFile.LOAD_TRUNCATED_IMAGES = True

# One shared lock so only one thread cleans images at a time
mu = threading.Lock()


def process_img(path, new_path):
    dirs = os.listdir(path)
    for pic_dir in dirs:
        print(pic_dir)
        dir_path = os.path.join(path, pic_dir)
        pics = os.listdir(dir_path)
        for pic in pics:
            pic_path = os.path.join(dir_path, pic)
            image = face_recognition.load_image_file(pic_path)
            # Face detection: skip the image if no face is found
            face_locations = face_recognition.face_locations(image)
            if len(face_locations) == 0:
                continue
            img = Image.open(pic_path)
            new_pic_path = os.path.join(new_path, pic_dir)
            if not os.path.exists(new_pic_path):
                os.makedirs(new_pic_path)
            if len(img.split()) == 4:
                # Use split and merge to drop the alpha channel (RGBA -> RGB)
                r, g, b, a = img.split()
                toimg = Image.merge('RGB', (r, g, b))
                toimg.save(os.path.join(new_pic_path, pic))
            else:
                try:
                    img.save(os.path.join(new_pic_path, pic))
                except OSError:
                    continue
    print('Finish......!')


def lock_test(path, new_path):
    with mu:
        process_img(path, new_path)


if __name__ == '__main__':
    file_dir = r'BingImage/'  # crawled images, one sub-folder per celebrity
    new_dir = r'newFace/'     # cleaned face images are written here
    my_thread = threading.Thread(target=lock_test, args=(file_dir, new_dir))
    my_thread.start()
    my_thread.join()
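A quick way to verify the result is to count how many images survived the cleaning for each celebrity (a sketch, assuming the 'newFace/' output directory used above):

# Count the cleaned images in each celebrity folder under the output directory.
import os

new_path = 'newFace/'
for star in sorted(os.listdir(new_path)):
    star_dir = os.path.join(new_path, star)
    if os.path.isdir(star_dir):
        print(star, len(os.listdir(star_dir)))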