就是照搬别人的文章到公众号上,一般带格式的内容是不能直接复制粘贴的,怎么办呢?爬源码
import requests
import re
import time
from lxml import html
from selenium import webdriver
# The most basic kind of request: a plain GET of the article page.
article_url = 'https://mp.weixin.qq.com/s?__biz=MzA5NjgxNjgxNQ==&mid=403557217&idx=1&sn=3b8038565f9c699a0121f64aed2f5d22&mpshare=1&scene=1&srcid=1206O2RAeNX16c88CbMrryCI&key=f57fc7001c9b61fadf60eb0d80c982c3f9b772f324115b802c9c69eba4603a5f6da7bf5ee9975261ac5812427e154113c8c2eba3f19dbf10c35ae2251b4f6aed955bd68532a3f4248069b54851973942&ascene=0&uin=MjEyODY1MzIwMQ%3D%3D&devicetype=iMac+MacBookPro11%2C1+OSX+OSX+10.12.3+build(16D32)&version=11000003&pass_ticket=5jR8RnNSI7woS8zm30GvzXC2C8NHS5ayD4%2B7qltAzc%2FzfQgzX4KOt1d3LtJrvfVD'
r = requests.get(url=article_url)
re.S是指让“.”也能匹配换行符,这样才能跨行匹配,不然查找不到,真是坑死人了
# Pull the article-body <div> out of the fetched HTML. The re.S flag lets "."
# match newline characters, so the non-greedy match can span several lines.
content = re.compile(r'<div class="rich_media_content " id="js_content">.*?</div>', re.S).findall(r.text)
然而这里有个问题:这里获取的是网页源码(code),与在网页里看到的元素(element)不一致,网页里看到的是执行了所有js请求之后的情况。搜索了一下,无解,换一个办法
改用自动化工具selenium,这个是动态的
from selenium import webdriver
import time
# Address of the article to open in the browser.
article_url = 'https://mp.weixin.qq.com/s?__biz=MzA5NjgxNjgxNQ==&mid=403557217&idx=1&sn=3b8038565f9c699a0121f64aed2f5d22&mpshare=1&scene=1&srcid=1206O2RAeNX16c88CbMrryCI&key=f57fc7001c9b61fadf60eb0d80c982c3f9b772f324115b802c9c69eba4603a5f6da7bf5ee9975261ac5812427e154113c8c2eba3f19dbf10c35ae2251b4f6aed955bd68532a3f4248069b54851973942&ascene=0&uin=MjEyODY1MzIwMQ%3D%3D&devicetype=iMac+MacBookPro11%2C1+OSX+OSX+10.12.3+build(16D32)&version=11000003&pass_ticket=5jR8RnNSI7woS8zm30GvzXC2C8NHS5ayD4%2B7qltAzc%2FzfQgzX4KOt1d3LtJrvfVD'
# How long to pause after navigation so the page can finish loading.
load_wait_seconds = 60

# Start a Chrome session through Selenium and navigate to the article.
browser = webdriver.Chrome()
browser.get(article_url)
time.sleep(load_wait_seconds)
sleep就是让网页加载完成后再获取需要的内容
import codecs
# Extract the article-body <div> from the HTML held by the browser. re.S lets
# "." match newlines so the non-greedy match can span several lines.
# NOTE(review): ".*?</div>" stops at the first closing </div>; if the body
# contains nested <div> elements the capture may be truncated - confirm
# against a real page.
content = re.findall(r'<div class="rich_media_content " id="js_content">.*?</div>',browser.page_source, re.S)
if not content:
    # Same exception type that content[0] would raise, but with a message that
    # says what was actually missing.
    raise IndexError('article body <div id="js_content"> not found in page source')
# Strip newline characters, then write the fragment to the output file.
new_content = content[0].replace('\n', '')
# The with-block closes the file even when write() raises.
with codecs.open("/Users/xxx/Desktop/markdown/7.8.md", 'w', 'utf-8') as file_obj:
    file_obj.write(new_content)
这时候用vim打开7.8.md时,打开的网页就是和原网页一模一样的网页,但发现有些图片不显示,毕竟是别人公众号的图片
# Collect every double-quoted http:// URL found in the extracted article HTML
# (the candidate image links). The surrounding quote characters are part of
# each match.
imgs = re.findall(r'\"http://.*?\"', content[0], re.S)
for img in imgs:
    # Print each URL followed by a blank separator line. The parenthesised
    # single-argument form behaves the same on Python 2 and Python 3.
    print(img)
    print('')
可以将图片上传到自己的公众号;公众号没有认证的话,只能上传临时素材
import json
# Developers can obtain this token through an API; it was copied in here by hand.
# NOTE(review): a credential hard-coded in source is readable by anyone with
# access to the file - consider loading it from configuration instead.
access_token="_RyG5BzY0Ait19ctrYtCmHe5-FT5VVqUy14HFFsa7BZbtq9btBE6diEFem6yjiuinZD7xApbqbJO6nwKhx99N9V2ClmPeUHHIthUqhkjH2XPKqB7S8u6Yc0bprsjh8GDVEEjAEALUU"
# Download the source image as raw bytes.
pp=requests.get("http://mmbiz.qpic.cn/mmbiz/x0QjkAOuB5YoQpVBrCWVdouMKd1UxjYhiaXnfQ3vF7KHiaFhQe91Gtsd1cNXZYzHoaGSpv2ak2M8pb9icSEkBKic1A/0?wx_fmt=jpeg").content
# Multipart form field "media": (file name, file data).
# NOTE(review): the file is labelled .png while the source URL requests
# wx_fmt=jpeg - confirm the upload API does not depend on the extension.
files = {'media': ('temp2.png',pp)}
# Upload endpoint for an image, authenticated by the access token.
upload_url="https://api.weixin.qq.com/cgi-bin/media/upload?access_token="+access_token+"&type=image"
r1 =requests.post(upload_url, files=files)
upload_result = json.loads(r1.content)
if 'media_id' not in upload_result:
    # Keep the KeyError a failed lookup would raise, but include the response
    # body so the API's own error details are visible.
    raise KeyError("media_id missing from upload response: %r" % (upload_result,))
# Identifier of the uploaded image, used to fetch it back.
media_id=upload_result['media_id']
再使用media_id获取图片,得到图片网址
# Build the media "get" endpoint URL for the uploaded item and request it.
getload_url = "".join([
    "https://api.weixin.qq.com/cgi-bin/media/get?access_token=",
    access_token,
    "&media_id=",
    media_id,
])
pp = requests.get(getload_url)
# Show the response object's attribute names, then its url attribute.
print(dir(pp))
print(pp.url)
只要把这个网址换掉之前的图片网址,一篇文章就出来了