技术选型
- scrapy vs requests+beautifulsoup
- requests+beautifulsoup都只是第三方模块,scrapy则是框架。
- scrapy框架中可以加入requests和beautifulsoup。
- scrapy基于twisted,性能是最大的优势。
- scrapy方便扩展,提供很多内置的功能。
- scrapy内置的css和xpath selector非常方便,beautifulsoup最大的缺点就是慢。
网页分类
- 常见网页类型
- 静态网页
- 动态网页
- webservice(restapi)
爬虫能做什么
- 爬虫作用
- 搜索引擎--百度、谷歌、垂直领域搜索引擎
- 推荐引擎--今日头条
- 机器学习的数据样本
- 数据分析(金融数据分析)、舆情分析等
正则表达式
- 正则表达式
-
特殊字符
[1]. ^ $ * ? + {2} {2,} {2,5} |
[2]. [] [^] [a-z] .
[3]. \s \S \w \W
[4]. [\u4E00-\u9FA5] () \d
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
"""
@author 金全 JQ
@version 1.0 , 2017/9/30
@description Regular expression examples
"""
import re

line = 'jinquan123'
# Starts with 'j', anything after
regex_str = "^j.*"
# Anything before, must end with '3'
regex_end = ".*3$"
# Starts with 'j', anything in the middle, ends with '3'
regex_all = "^j.*3$"
# Check and print the match instead of silently discarding the result,
# consistent with the other examples in this file.
match_result = re.match(regex_str, line)
if match_result:
    print(match_result.group(0))
# Greedy vs. non-greedy matching; () captures a sub-group.
line_greedy = 'booobbb123'

# Greedy is the default: the leading .* consumes as much as possible,
# pushing the captured group to the right of the string.
pattern_greedy_right = re.compile(".*(b.*b).*")
match_result_right = pattern_greedy_right.match(line_greedy)
if match_result_right:
    # prints "bb"
    print(match_result_right.group(1))

# A lazy leading .*? lets the group start at the leftmost 'b', while the
# greedy b.*b inside still stretches to the rightmost 'b'.
pattern_greedy_all = re.compile(".*?(b.*b).*")
match_result_all = pattern_greedy_all.match(line_greedy)
if match_result_all:
    # prints "booobbb"
    print(match_result_all.group(1))

# Fully non-greedy: the group starts at the leftmost 'b' and stops at the
# very next 'b' after it.
pattern_greedy_left = re.compile(".*?(b.*?b).*")
match_result_left = pattern_greedy_left.match(line_greedy)
if match_result_left:
    # prints "booob"
    print(match_result_left.group(1))
# 限定池 +
line_limit = "booobbbaabb123"
regex_limit_greedy = ".*(b.*b).*"
match_result_limit = re.match(regex_limit_greedy,line_limit)
# result bb
if match_result_limit :
print (match_result_limit.group(1))
regex_limit_between = ".*(b.+b).*"
match_result_limit_between = re.match(regex_limit_between,line_limit)
# result baab
if match_result_limit_between:
print(match_result_limit_between.group(1))
# Brace quantifiers {2} {2,} {2,4} — these interact with backtracking,
# so step through them yourself to see how the matches land.
line_list = "booooobbbaaab123"

# Exactly one character between the two b's.
pattern_list_low = re.compile(".*(b.{1}b).*")
match_result_list_low = pattern_list_low.match(line_list)
if match_result_list_low:
    print(match_result_list_low.group(1))
else:
    print("none")

# Two or more characters between the two b's.
pattern_list_all = re.compile(".*(b.{2,}b).*")
match_result_list_all = pattern_list_all.match(line_list)
if match_result_list_all:
    print(match_result_list_all.group(1))

# Between two and three characters between the two b's.
pattern_list_high = re.compile(".*(b.{2,3}b).*")
match_result_list_high = pattern_list_high.match(line_list)
if match_result_list_high:
    print(match_result_list_high.group(1))
# | expresses alternation (logical OR).
line_or = "jinquan123"

# The first alternative that matches wins, so this stops at "jinquan".
pattern_or_one = re.compile("jinquan|jinquan123")
match_result_one = pattern_or_one.match(line_or)
if match_result_one:
    print(match_result_one)

# Parentheses group the alternatives; group(1) is the alternative matched.
pattern_or_two = re.compile("(jiquan|jinquan)123")
match_result_or_two = pattern_or_two.match(line_or)
if match_result_or_two:
    print(match_result_or_two.group(1))

# An outer group captures the whole alternation plus the suffix.
pattern_or_three = re.compile("((jiquan|jinquan)123)")
match_result_or_three = pattern_or_three.match(line_or)
if match_result_or_three:
    print(match_result_or_three.group(1))
# [] character classes
line_number = "18146456231"
# '1', then one of 3/8/5, then nine digits — a simple phone-number pattern.
regex_number_one = "(1[385][0-9]{9})"
match_result_number_one = re.match(regex_number_one, line_number)
if match_result_number_one:
    print(match_result_number_one.group(1))
# [^0] means any single character except '0'.
# Renamed to *_two: the original reused the *_one names and silently
# overwrote the first example's pattern and result.
regex_number_two = "(1[385][^0]{9})"
match_result_number_two = re.match(regex_number_two, line_number)
if match_result_number_two:
    print(match_result_number_two.group(1))
# \s matches a whitespace character; \S matches any non-whitespace.
line_str_nbsp = "你 好"
# Raw string: "\s" in a plain string is an invalid escape sequence and
# raises a DeprecationWarning/SyntaxWarning on modern Python.
regex_nbsp_one = r"(你\s好)"
match_result_nbsp = re.match(regex_nbsp_one, line_str_nbsp)
if match_result_nbsp:
    print(match_result_nbsp.group(1))
# \w is equivalent to [A-Za-z0-9_] (plus other word chars in Unicode mode);
# \W matches anything that is not a word character.
line_str_w = "你m好"
# Raw string for the same reason as above ("\w" is an invalid escape).
regex_w_one = r"(你\w好)"
match_result_w = re.match(regex_w_one, line_str_w)
if match_result_w:
    print(match_result_w.group(1))
# [\u4E00-\u9FA5] covers the common CJK (Chinese) character range.
line_str_c = "你好"
pattern_c_one = re.compile("([\u4E00-\u9FA5]+)")
match_result_c = pattern_c_one.match(line_str_c)
if match_result_c:
    # prints "你好"
    print(match_result_c.group(1))
else:
    print("none")

# Lazy .*? skips the ASCII prefix, then the group grabs the Chinese
# characters that end in 学院.
line_str_c_two = "study in 滁州学院"
pattern_c_two = re.compile(".*?([\u4E00-\u9FA5]+学院)")
match_result_c_two = pattern_c_two.match(line_str_c_two)
if match_result_c_two:
    # prints "滁州学院"
    print(match_result_c_two.group(1))
else:
    print("none")
# \d extracts digits.
line_number_year = "xxx出生于1994年12月12日"
# Raw string: "\d" in a plain string is an invalid escape sequence and
# raises a DeprecationWarning/SyntaxWarning on modern Python.
regex_year_mounth_day = r".*?((\d+)年(\d+)月(\d+)日)"
match_result_year_mounth_day = re.match(regex_year_mounth_day, line_number_year)
if match_result_year_mounth_day:
    print(match_result_year_mounth_day.group(1))
else:
    print("none")
# Date extraction: one pattern handling several date formats.
line_year_mounth_day_one = "XXX出生于1994年1月12日"
line_year_mounth_day_two = "XXX出生于1994-1-12"
line_year_mounth_day_three = "XXX出生于1994/1/12"
line_year_mounth_day_four = "XXX出生于1994-01-12"
line_year_mounth_day_five = "XXX出生于1994-01"
# Raw string: "\d" in a plain string is an invalid escape sequence and
# raises a DeprecationWarning/SyntaxWarning on modern Python.
# The alternation allows "year-month-day", "year-month<end>", or just
# "year-month" with no trailing separator.
regex_year_mounth_day_all = r".*出生于(\d{4}[年/-]\d{1,2}([月/-]\d{1,2}|[月/-]$|$))"
match_result_year_mounth_day_one = re.match(regex_year_mounth_day_all, line_year_mounth_day_one)
if match_result_year_mounth_day_one:
    print(match_result_year_mounth_day_one.group(1))
match_result_year_mounth_day_two = re.match(regex_year_mounth_day_all, line_year_mounth_day_two)
if match_result_year_mounth_day_two:
    print(match_result_year_mounth_day_two.group(1))
match_result_year_mounth_day_three = re.match(regex_year_mounth_day_all, line_year_mounth_day_three)
if match_result_year_mounth_day_three:
    print(match_result_year_mounth_day_three.group(1))
match_result_year_mounth_day_four = re.match(regex_year_mounth_day_all, line_year_mounth_day_four)
if match_result_year_mounth_day_four:
    print(match_result_year_mounth_day_four.group(1))
match_result_year_mounth_day_five = re.match(regex_year_mounth_day_all, line_year_mounth_day_five)
if match_result_year_mounth_day_five:
    print(match_result_year_mounth_day_five.group(1))
深度优先和广度优先
- 目录
- 网站树结构
- 深度优先算法和实现(栈)
- 广度优先算法和实现(队列)
爬虫去重
- url存入数据库
- url存入set中,需要o(1)的代价查询100000000 * 2byte * 50个字符/1024/1024/1024 = 9G
- url经过md5等方法保存set中
- 用bitmap方法,url通过hash函数映射
- bloomfilter方法对bitmap进行改进,多重hash函数降低冲突
- 原视频UP主慕课网(聚焦Python分布式爬虫必学框架Scrapy 打造搜索引擎)
- 本篇博客撰写人: XiaoJinZi 个人主页 转载请注明出处
- 学生能力有限 附上邮箱: 986209501@qq.com 不足以及误处请大佬指正