求素数
# -*- coding: utf-8 -*-
# 循环只能到第二个数减1,所以第二个数是101
# Collect all primes in [2, 100] by trial division.
# Fixes over the original: the local was named `list`, shadowing the
# builtin (and `list = list()` fails if the script body runs twice);
# the Python-2-only `print` statement is replaced with the print()
# call already used elsewhere in this file.
primes = []
for candidate in range(2, 101):
    for divisor in range(2, candidate + 1):
        # If divisor climbed all the way to candidate, no smaller
        # divisor evenly divided it, so candidate is prime.
        if divisor == candidate:
            primes.append(candidate)
        # First even division ends the inner loop (candidate % candidate
        # == 0, so the prime case also terminates here).
        if candidate % divisor == 0:
            break
print("素数:", primes)
结果如下:
素数: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97]
糗百首页爬虫:
# encoding=utf-8
from lxml import etree
import requests
def request():
    """Fetch the Qiushibaike text-jokes landing page and return the HTTP response."""
    return requests.get('http://www.qiushibaike.com/text/')
def parse(text):
    """Parse the Qiushibaike front-page HTML into a list of post dicts.

    Each dict carries the keys: author, sex, age, content, votes, comments.
    Prints every parsed item as a side effect (kept from the original).

    Bug fix: the original created ONE item dict before the loop and
    appended that same object on every iteration, so all entries of the
    returned list aliased the last post's data. A fresh dict is now
    created per post. The local previously named `list` (shadowing the
    builtin) is renamed.
    """
    root = etree.HTML(text)
    posts = root.xpath("//div[@class='article block untagged mb15']")
    items = []
    for post in posts:
        # Fresh dict per post so appended entries stay independent.
        item = {"author": "", "sex": "", "age": "", "content": "",
                "votes": "", "comments": ""}
        author_nodes = post.xpath('div[1]/a[2]/h2/text()')
        if len(author_nodes) > 0:
            item["author"] = author_nodes[0]
            item["age"] = post.xpath('div[1]/div[1]/text()')[0]
            # Gender is encoded in the CSS class of the age <div>.
            sex = post.xpath('div[1]/div[1]')[0].get("class")
            if sex == "articleGender manIcon":
                item["sex"] = "男"
            elif sex == "articleGender womenIcon":
                item["sex"] = "女"
            else:
                item["sex"] = "未知"
        else:
            # Anonymous posters use a different DOM layout (span instead
            # of the profile link), so the author xpath changes.
            item["author"] = post.xpath("div[1]/span/h2/text()")[0]
            item["sex"] = "未知"
            item["age"] = "未知"
        item["content"] = post.xpath('a/div/span/text()')[0]
        item["votes"] = post.xpath('div[2]/span[1]/i/text()')[0]
        item["comments"] = post.xpath('*/span[@class = "stats-comments"]/a/i/text()')[0]
        print(item)
        items.append(item)
    return items
if __name__ == '__main__':
    # Fetch the page, parse it, and report how many posts were scraped.
    response = request()
    results = parse(response.text)
    print("运行结束,爬取", len(results), "条数据。")
输出结果:
简书首页爬虫:
- 相比糗百的爬虫,简书的只需要设置一下请求头 `header`,主要是设置 `User-Agent`,不然无法正常请求数据。
- 关于 xpath 提取阅读数和评论数为空的问题。
因为阅读数用 `xpath("div[2]/a[2]/text()")` 实际提取出来的是这样的 list:`['\n ', ' 15\n']`,数据其实存放在第二个位置,所以需要用 `[1]` 来提取,而不是一成不变地使用 `[0]`,然后用 `strip()` 去除字符串中的空格和换行符。
- 因为文章可能未收录到专题,此时就会解析不到专题的元素,需要判断这种情况,然后改变解析的策略,并将原本在 `a[2], a[3]` 号位的阅读数和评论数的顺序提前到 `a[1], a[2]` 号位来解析。
# encoding=utf-8
from lxml import etree
import requests
def request():
    """GET the Jianshu homepage, sending browser-like request headers.

    Per the notes above the snippet, the User-Agent header is required —
    without it the site will not serve the page normally.
    """
    headers = {
        "X-Infinitescroll": "true",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    }
    return requests.get(url='http://www.jianshu.com/', headers=headers)
def parse(text):
    """Parse the Jianshu homepage HTML into a list of article dicts.

    Each dict carries the keys: author, title, time, reads, comments,
    likes, reward, subject. Prints every parsed item as a side effect
    (kept from the original).

    Fixes over the original:
    - A fresh item dict is created per article; the original reused one
      dict, so every entry of the returned list aliased the same object
      and ended up holding the last article's data.
    - The subject check no longer indexes [0] before testing emptiness,
      which would raise IndexError when the xpath matched nothing.
    - The reward xpath is evaluated once instead of twice, and the local
      previously named `list` (shadowing the builtin) is renamed.
    """
    root = etree.HTML(text)
    articles = root.xpath('//li/div')
    items = []
    for article in articles:
        item = {"author": "", "title": "", "time": "", "reads": "",
                "comments": "", "likes": "", "reward": "", "subject": ""}
        item["author"] = article.xpath('div/div/a/text()')[0]
        item["title"] = article.xpath('a/text()')[0]
        item["time"] = article.xpath('div/div/span/@data-shared-at')[0]
        item["likes"] = article.xpath("div[2]/span[1]/text()")[0].strip()
        # The reward span is optional; fall back to "0" when absent.
        reward_list = article.xpath("div[2]/span[2]/text()")
        item["reward"] = reward_list[0].strip() if reward_list else "0"
        # Articles not yet included in a collection lack the subject link,
        # which shifts reads/comments from slots a[2]/a[3] up to a[1]/a[2].
        subject_list = article.xpath("div[2]/a[1]/text()")
        if subject_list and subject_list[0].strip():
            item["subject"] = subject_list[0].strip()
            # The extracted text looks like ['\n ', ' 15\n']: the number
            # sits at index 1, hence [1] followed by strip().
            item["reads"] = article.xpath("div[2]/a[2]/text()")[1].strip()
            item["comments"] = article.xpath("div[2]/a[3]/text()")[1].strip()
        else:
            item["subject"] = "未收录专题"
            item["reads"] = article.xpath("div[2]/a[1]/text()")[1].strip()
            item["comments"] = article.xpath("div[2]/a[2]/text()")[1].strip()
        print(item)
        items.append(item)
    return items
if __name__ == '__main__':
    # Entry point: download the homepage, parse it, report the count.
    page = request()
    scraped = parse(page.text)
    print("运行结束,爬取", len(scraped), "条数据。")
输出结果: