from html.parser import HTMLParser
from html.entities import name2codepoint
class MyHTMLParser(HTMLParser):
    """HTML parser that prints the text found between tags.

    Only text nodes are handled; tags, comments and declarations are
    ignored because no other handler is overridden.
    """

    def handle_data(self, data: str) -> None:
        """Print one text node with surrounding whitespace removed.

        Whitespace-only nodes (such as the newline between two tags) are
        skipped so the output does not fill up with blank lines.

        Args:
            data: Raw text passed in by ``HTMLParser`` for one text node.
        """
        text = data.strip()
        if text:
            print(text)
import requests
import re

# Download the page at `url` and print the text of one section of it.
parser = MyHTMLParser()
url = 'https://new.qq.com/omn/20200418/20200418A0QEEO00.html'

# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as content.
rep = requests.get(url, timeout=10)
rep.raise_for_status()

# When the server declares no charset, requests decodes text responses as
# ISO-8859-1, which garbles non-Latin pages; use the detected encoding instead.
if 'charset' not in rep.headers.get('Content-Type', '').lower():
    rep.encoding = rep.apparent_encoding
data = rep.text

# Capture the markup between the opening `<div class="LEFT">` tag and the
# `<div id="RIGHT" class="RIGHT">` tag.
x = re.search(r'<div class="LEFT">([\s\S]*)<div id="RIGHT" class="RIGHT">', data)
if x is None:
    # The page layout differs from what the pattern expects; fail with a
    # clear message instead of an AttributeError on `None.group`.
    raise SystemExit('could not locate the LEFT/RIGHT div markers in %s' % url)

parser.feed(x.group(1))
# close() flushes any text the parser is still buffering.
parser.close()
# Output: the text content of the news article.