https://en.wikipedia.org/wiki/Wikipedia
Crawl two layers of words related to a single word.
Each Wikipedia entry has a URL of the form
https://en.wikipedia.org/wiki/<word>
and the matching regex is <a href="/wiki/([^:#=<>]*?)".*?</a>
For example, the entry Wikipedia has the URL
https://en.wikipedia.org/wiki/Wikipedia
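A quick way to check what the regex captures (the HTML snippet below is made up purely for illustration):

import re

sample_html = ('<a href="/wiki/Encyclopedia" title="Encyclopedia">encyclopedia</a> '
               '<a href="/wiki/Category:Articles" title="Category">category</a>')
print(re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', sample_html))
# prints ['Encyclopedia']; links containing : # = < or > (e.g. Category: pages) are skipped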
Request headers. The main one is the "who am I" header, the User-Agent:
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36
Fetch the page HTML; on an exception, keep going, and still add the word to the crawled list:
try:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    r = requests.get("https://en.wikipedia.org/wiki/" + current_word, headers=headers)
    html = r.text
except Exception as e:
    print('Failed downloading and saving', current_word)
    print(e)
    exist_url.append(current_word)
    return None
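Note that requests.get() only raises for network-level failures; an HTTP error page (a 404, say) still returns normally. If those should also land in the except branch, one option (my addition, not part of the original code) is to call r.raise_for_status() right after the request:

r = requests.get("https://en.wikipedia.org/wiki/" + current_word, headers=headers)
r.raise_for_status()  # optional extra check: raise on HTTP 4xx/5xx status codes
html = r.text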
Use the regex <a href="/wiki/([^:#=<>]*?)".*?</a> to find the related entries on the page.
Keep a record of the words that have already been crawled, so nothing gets crawled twice:
# record this word as crawled
exist_url.append(current_word)
link_list = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', html)  # wiki links on this page
unique_list = list(set(link_list) - set(exist_url))  # only the ones not crawled yet
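A tiny illustration of the set-difference dedup used above (the values are made up):

link_list = ['Encyclopedia', 'Wikipedia', 'Encyclopedia']  # hypothetical regex matches
exist_url = ['Wikipedia']                                  # already crawled
print(list(set(link_list) - set(exist_url)))               # ['Encyclopedia']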
Write each related word to the file link_12-3.txt, then recurse into the next layer (depth-first, down to depth 2):
for wordInThisPage in unique_list:
    g_writecount += 1
    output = "No." + str(g_writecount) + "\t Depth:" + str(depth) + "\t" + current_word + ' -> ' + wordInThisPage + '\n'
    # append to the file link_12-3.txt
    with open('link_12-3.txt', "a+") as f:
        f.write(output)
    if depth < 2:  # recurse into the next layer, depth-first
        scrappy(wordInThisPage, depth + 1)
The complete depth-first code:
import requests
import re
import time

time1 = time.time()
exist_url = []    # words already crawled
g_writecount = 0  # number of links written to the file

def scrappy(current_word, depth=1):
    # assigning to a module-level variable inside a function requires a global declaration
    global g_writecount
    print(current_word)
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        r = requests.get("https://en.wikipedia.org/wiki/" + current_word, headers=headers)
        html = r.text
    except Exception as e:
        print('Failed downloading and saving', current_word)
        print(e)
        exist_url.append(current_word)
        return None
    # record this word as crawled
    exist_url.append(current_word)
    link_list = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', html)  # wiki links on this page
    unique_list = list(set(link_list) - set(exist_url))  # only the ones not crawled yet
    for wordInThisPage in unique_list:
        g_writecount += 1
        output = "No." + str(g_writecount) + "\t Depth:" + str(depth) + "\t" + current_word + ' -> ' + wordInThisPage + '\n'
        # append to the file link_12-3.txt
        with open('link_12-3.txt', "a+") as f:
            f.write(output)
        # recurse into each related entry, depth-first
        if depth < 2:
            scrappy(wordInThisPage, depth + 1)

scrappy("Wikipedia")
time2 = time.time()
print("Total time", time2 - time1)
Breadth-first search with multiple threads
For each layer, crawl each word in its own thread as far as possible (when the thread pool has fewer threads than remaining words, the rest go into the next batch).
#!/usr/bin/env python
# coding=utf-8
import threading
import requests
import re

# used as a lock
g_mutex = threading.Condition()
g_row_queue_word = []  # next layer's words waiting to be crawled (each word is already in the file, its related words are not yet)
g_existWord = []       # words whose related entries have all been written to the file
g_write_count = 0      # number of entries written so far


class Crawler:
    def __init__(self, word, threadnum):
        self.word = word
        self.thread_num = threadnum
        self.thread_pool = []

    def craw(self):
        g_row_queue_word.append(self.word)
        depth = 1
        while depth < 3:
            print('Searching depth ', depth, ' ...\n')
            self.download_all()  # crawl this whole layer and collect the next layer's words
            depth += 1

    # crawl all words of the current layer with multiple threads
    def download_all(self):
        global g_row_queue_word
        this_layer = g_row_queue_word  # the words waiting to be crawled in this layer
        g_row_queue_word = []          # the threads fill this with the next layer's words
        had_craw_count = 0             # how many of this layer's words have been crawled
        # loop until every word of this layer has been crawled
        while had_craw_count < len(this_layer):
            # start threads up to the pool limit, or as many as the remaining words need
            start_thread = 0
            while start_thread < self.thread_num and had_craw_count + start_thread < len(this_layer):
                self.download(this_layer[had_craw_count + start_thread], start_thread)
                start_thread += 1
            # wait for the threads in the pool to finish
            for thread in self.thread_pool:
                thread.join(30)
            self.thread_pool = []
            had_craw_count += start_thread

    # start one crawler thread
    def download(self, url, tid):
        craw_thread = CrawlerThread(url, tid)
        self.thread_pool.append(craw_thread)
        craw_thread.start()


class CrawlerThread(threading.Thread):  # crawler thread
    def __init__(self, word, tid):
        threading.Thread.__init__(self)
        self.word = word
        self.tid = tid

    # write the entries found on this word's page to the file, queue them for the
    # next layer in g_row_queue_word, and mark this word as finished in g_existWord
    def run(self):
        global g_write_count
        global g_row_queue_word
        write_file_words = []
        try:
            print(self.tid, "crawl ", self.word)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            r = requests.get("https://en.wikipedia.org/wiki/" + self.word, headers=headers)
            html = r.text
            link_list2 = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', html)
            unique_list2 = list(set(link_list2))
            for eachone in unique_list2:
                # the counter and the output file are shared by all threads, so write under the lock
                with g_mutex:
                    g_write_count += 1
                    content2 = "No." + str(g_write_count) + "\t Thread" + str(self.tid) + "\t" + self.word + '->' + eachone + '\n'
                    with open('title2.txt', "a+") as f:
                        f.write(content2)
                write_file_words.append(eachone)
        except Exception as e:
            print('Failed downloading and saving', self.word)
            print(e)
            return None
        # atomic update of the shared queues
        with g_mutex:
            g_row_queue_word = list(set(g_row_queue_word + write_file_words) - set(g_existWord))
            g_existWord.append(self.word)


if __name__ == "__main__":
    word = "Wikipedia"
    thread_num = 5
    crawler = Crawler(word, thread_num)
    crawler.craw()  # start crawling
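For comparison, the same layer-by-layer batching can be sketched with the standard-library concurrent.futures thread pool. This is only a sketch of the idea, not the code above: the function names (fetch_links, crawl_bfs) and the simplified User-Agent are my own choices.

import re
import requests
from concurrent.futures import ThreadPoolExecutor

HEADERS = {'User-Agent': 'Mozilla/5.0'}          # simplified UA, placeholder
LINK_RE = '<a href="/wiki/([^:#=<>]*?)".*?</a>'  # same regex as above

def fetch_links(word):
    # download one entry and return the titles it links to; empty list on failure
    try:
        r = requests.get("https://en.wikipedia.org/wiki/" + word, headers=HEADERS)
        return list(set(re.findall(LINK_RE, r.text)))
    except Exception as e:
        print('Failed downloading', word, e)
        return []

def crawl_bfs(start_word, max_depth=2, thread_num=5):
    seen = {start_word}
    layer = [start_word]  # words of the current layer
    for depth in range(1, max_depth + 1):
        with ThreadPoolExecutor(max_workers=thread_num) as pool:
            results = list(pool.map(fetch_links, layer))
        next_layer = []
        for word, links in zip(layer, results):
            for linked in links:
                if linked not in seen:  # dedup across all layers
                    seen.add(linked)
                    next_layer.append(linked)
        print('Depth', depth, ':', len(layer), 'words crawled,', len(next_layer), 'queued')
        layer = next_layer

# crawl_bfs("Wikipedia")

Here the executor plays the role of the hand-rolled thread pool and the join(30) batching, and the seen set plays the role of g_existWord.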