import unittest
import time
import re
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
class seleniumTest(unittest.TestCase):
    """Scrape case texts from the alk.12348.gov.cn case library.

    The single test drives a real browser through the paginated search
    result list, stores every ``<a target="_blank">`` element in
    ``CASE_PATH``, rebuilds the detail-page URL of each case from the stored
    markup, downloads each detail page with ``urllib`` and appends the
    tag-stripped case body to ``ESSAY_PATH``.

    Both output files are written as UTF-8 so that the Chinese text does not
    depend on the platform's default encoding.
    """

    # Search entry page (query over several case databases).
    SEARCH_URL = "http://alk.12348.gov.cn/LawMultiSearch?checkDatabaseID=28%2C29%2C30%2C31%2C67%2C36%2C68%2C69%2C70%2C71"
    # Output files: raw anchor markup, and the extracted case bodies.
    CASE_PATH = 'C:/Users/ergou/Documents/PycharmProject/Community_corrections/case.txt'
    ESSAY_PATH = 'C:/Users/ergou/Documents/PycharmProject/Community_corrections/essay.txt'
    # Per-request timeout (seconds) for the detail-page downloads.
    FETCH_TIMEOUT = 1
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'

    # HTML comments that delimit the case body inside a detail page.
    BODY_START = u"<!--案例内文开始-->"
    BODY_END = u"<!--案例内文结束-->"

    # Patterns that pull the three query parameters out of the stored anchors.
    _DBID_RE = re.compile(r'dbID=(.*?)&dbName=')
    _DBNAME_RE = re.compile(r'&dbName=(.*?)&sysID=')
    _SYSID_RE = re.compile(r'&sysID=(.*?)" target="_blank')
    # Matches opening/closing tags and ``<!name ...>`` declarations; HTML
    # comments (``<!-- ... -->``) are not matched and therefore kept.
    _TAG_RE = re.compile(r'<!?/?\w+[^>]*>')

    def setUp(self):
        """Start a browser: Firefox first, Chrome as the fallback.

        A visible browser is used because it is easier to debug than a
        headless one.

        Raises:
            Exception: whatever ``webdriver.Chrome()`` raised, when neither
                browser could be started. Failing here reports the real
                cause instead of a later ``AttributeError`` on
                ``self.driver``.
        """
        try:
            self.driver = webdriver.Firefox()
            return
        except Exception as e:
            print(e)
        try:
            self.driver = webdriver.Chrome()
        except Exception as e:
            print(e)
            raise

    def testEle(self):
        """Crawl the result list, then download and store every case body."""
        driver = self.driver
        # Maximise the browser window.
        driver.maximize_window()
        driver.get(self.SEARCH_URL)
        time.sleep(3)  # wait for the first result page to load

        data = self._collect_case_links(driver)

        with open(self.ESSAY_PATH, 'w', encoding='utf-8') as essay_file:
            for url in self._detail_urls(data):
                print(url)
                try:
                    content = self._fetch(url)
                    essay = self._extract_essay(content)
                    essay_file.write(essay + '\n' * 3)
                except Exception as e:
                    # Best effort per case: a timeout or any other download
                    # problem skips this case and the crawl continues.
                    print('a', str(e))

    def tearDown(self):
        """Close the browser so no driver process is left running."""
        print('down')
        self.driver.quit()

    def _collect_case_links(self, driver):
        """Walk all result pages and save every case anchor to ``CASE_PATH``.

        Returns:
            str: exactly the text written to the file, one serialised
            ``<a>`` element per line.
        """
        lines = []
        with open(self.CASE_PATH, 'w', encoding='utf-8') as case_file:
            while True:
                page = driver.page_source
                soup = BeautifulSoup(page, 'xml')
                # Anchors that open in a new tab are the case links.
                for anchor in soup.find_all('a', {'target': '_blank'}):
                    line = str(anchor) + '\n'
                    case_file.write(line)
                    lines.append(line)
                # No "next page" control in the markup: last page reached.
                if 'page-next' not in page:
                    break
                # "class name" is the locator strategy string behind
                # By.CLASS_NAME; find_element(by, value) is available in
                # both Selenium 3 and Selenium 4.
                driver.find_element('class name', 'page-next').click()
                time.sleep(1)
        return ''.join(lines)

    @classmethod
    def _detail_urls(cls, data):
        """Build the detail-page URLs from the saved anchor markup.

        The three parameter lists are paired positionally; if they differ in
        length, the surplus entries of the longer lists are ignored.
        """
        triples = zip(
            cls._DBID_RE.findall(data),
            cls._DBNAME_RE.findall(data),
            cls._SYSID_RE.findall(data),
        )
        return [
            "http://alk.12348.gov.cn/Detail?dbID=" + dbid
            + "&dbName=" + dbname
            + "&sysID=" + sysid
            for dbid, dbname, sysid in triples
        ]

    def _fetch(self, url):
        """Download ``url`` and return the body decoded as UTF-8.

        Undecodable bytes are dropped. Network errors and timeouts
        propagate to the caller.
        """
        req = urllib.request.Request(url, headers={'User-Agent': self.USER_AGENT})
        with urllib.request.urlopen(req, timeout=self.FETCH_TIMEOUT) as resp:
            return resp.read().decode("utf-8", "ignore")

    @classmethod
    def _extract_essay(cls, content):
        """Return the tag-stripped case body found in ``content``.

        The body is the text from ``BODY_START`` (inclusive) up to
        ``BODY_END`` (exclusive). An empty string is returned when either
        marker is missing, rather than slicing with ``find()``'s ``-1``.
        """
        start = content.find(cls.BODY_START)
        end = content.find(cls.BODY_END)
        if start == -1 or end == -1:
            return ''
        return cls._TAG_RE.sub('', content[start:end])
if __name__ == "__main__":
    # Run the test case through the unittest runner when executed as a script.
    unittest.main()
# python+selenium爬取司法行政案例
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...