1.0 从 image.html 文件中过滤出所有的图片链接,并把它们放到列表中
2.0 根据过滤出来的链接去网上下载图片,放到 images 文件夹中
3.0 给定一个网址,下载其中所有的图片到 img 文件夹中
要求用协程实现
1.0 从 image.html 文件中过滤出所有的图片链接,并把它们放到列表中
import re
def main():
    """Print every ``https://….jpg`` link found in ``image.html``.

    ``image.html`` is read from the current working directory and decoded
    as UTF-8. Each matched link is printed on its own line, in the order it
    appears in the file.
    """
    with open("image.html", "rb") as file:
        html = file.read().decode("utf-8")
    # The dot before "jpg" is escaped so it only matches a literal ".".
    # Unescaped, it matched any character and cut links short at text such
    # as "xjpg".
    for link in re.findall(r"https://.*?\.jpg", html):
        print(link)
# Run main() only when this file is executed as a script. The dunder names
# are required: a bare `name` is undefined and raises NameError.
if __name__ == "__main__":
    main()
2.0 根据过滤出来的链接去网上下载图片,放到 images 文件夹中
import re
import urllib.request
def downloader(img_name, img_url):
    """Download ``img_url`` and save it as ``images/<img_name>``.

    Args:
        img_name: File name to use inside the ``images`` directory.
        img_url: URL of the image to fetch.

    Errors raised while fetching the URL or writing the file propagate to
    the caller. The URL is printed once the file has been written.
    """
    import os

    # Fetch the raw image bytes; the `with` closes the response afterwards
    # instead of leaving the connection open.
    with urllib.request.urlopen(img_url) as response:
        img_content = response.read()
    # Create the target directory on demand so the write below does not
    # fail with FileNotFoundError on a first run.
    os.makedirs("images", exist_ok=True)
    with open("images/" + img_name, "wb") as f:
        f.write(img_content)
    print(img_url)
def main():
    """Download every ``https://….jpg`` link listed in ``image.html``.

    ``image.html`` is read from the current working directory. Each link is
    handed to ``downloader`` together with a file name built from its
    position in the match list (``0.jpg``, ``1.jpg``, ...).
    """
    with open("image.html", "rb") as file:
        raw_html = file.read()
    # The file was read as bytes, so decode it to text before searching.
    page_text = raw_html.decode("utf-8")
    links = re.findall(r"https://.*?\.jpg", page_text)
    for index, link in enumerate(links):
        downloader("%d.jpg" % index, link)
# Run main() only when this file is executed as a script. The dunder names
# are required: a bare `name` is undefined and raises NameError.
if __name__ == "__main__":
    main()
3.0 给定一个网址,下载其中所有的图片到 img 文件夹中
要求用协程实现
import re
import urllib.request
import gevent
from gevent import monkey
import socket
monkey.patch_all()
def downloader(img_name, img_url):
    """Fetch ``img_url`` and save it as ``img/<img_name>``, best effort.

    Args:
        img_name: File name to use inside the ``img`` directory.
        img_url: URL of the image to fetch.

    Any exception raised while fetching or writing is printed and
    swallowed, so the function always returns ``None``. On success the URL
    is printed after the file has been written.
    """
    import os

    try:
        # Send browser-like headers with the image request.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            "Accept-Language": "en",
        }
        request = urllib.request.Request(img_url, headers=header)
        # The `with` closes the response instead of leaking the connection.
        with urllib.request.urlopen(request) as response:
            img_content = response.read()
        # Create the target directory on demand; without it every write
        # would fail and only an error message would be printed.
        os.makedirs("img", exist_ok=True)
        with open("img/" + img_name, "wb") as f:
            f.write(img_content)
        print(img_url)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on.
        print(e)
def main(url="https://www.douyu.com/directory/game/yz"):
    """Download every ``.jpg`` image linked from ``url`` using gevent.

    Args:
        url: Page whose source is scanned for image links. Defaults to the
            address this script was originally written for.

    The page is fetched and searched for ``http(s)://….jpg`` links, and the
    list of links is printed. One greenlet per link then calls
    ``downloader`` with the file names ``0.jpg``, ``1.jpg``, ... and the
    function returns once all greenlets have finished.
    """
    # NOTE: no timeout is configured (the `socket.setdefaulttimeout(3)` call
    # this script once had is disabled), so a stalled server can hold up
    # the run.
    # Some sites answer 403 to requests without a browser-like User-Agent.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    request = urllib.request.Request(url, headers=header)
    # The `with` closes the response instead of leaking the connection.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode()
    # Extract all image links. "-" is placed last in the character class so
    # it is a literal hyphen. Written as "/-=" it formed a range that
    # matched ";" and "<" but never "-". ":" stays allowed explicitly.
    arr = re.findall(r"(https?://[a-zA-Z0-9_?&./=%:-]*?\.jpg)", content)
    print(arr)
    task_list = [
        gevent.spawn(downloader, "%d.jpg" % index, link)
        for index, link in enumerate(arr)
    ]
    gevent.joinall(task_list)
# Run main() only when this file is executed as a script. The dunder names
# are required: a bare `name` is undefined and raises NameError.
if __name__ == "__main__":
    main()