cbioportal爬取

import sys,os,re
import time
import shutil
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
#from selenium.webdriver.support import Thread
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support import expected_conditions as EC
# Proxy used to change the outgoing IP address when the site starts refusing
# requests. Replace with a working proxy before running.
PROXY_HOST = '222.73.130.111'
PROXY_PORT = 888
# Directory where Firefox stores downloaded files.
DOWNLOAD_DIR = '/tmp/mozilla_baowenjuan0'

profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.dir', DOWNLOAD_DIR)
# folderList=2 tells Firefox to use the custom directory set above.
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
# neverAsk.saveToDisk takes a comma-separated list of MIME types that are
# saved without a prompt (it is not a directory path).
profile.set_preference(
    'browser.helperApps.neverAsk.saveToDisk',
    'text/plain,text/csv,text/tab-separated-values,application/octet-stream',
)
# network.proxy.type=1 selects manual proxy configuration; without it the
# host/port preferences below are not used by Firefox.
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.http', PROXY_HOST)
profile.set_preference('network.proxy.http_port', PROXY_PORT)
# The pages fetched by this script are https:// URLs, which Firefox routes
# through the separate "ssl" proxy preferences.
profile.set_preference('network.proxy.ssl', PROXY_HOST)
profile.set_preference('network.proxy.ssl_port', PROXY_PORT)
# Flush the preferences only after all of them have been set.
profile.update_preferences()
option = webdriver.FirefoxOptions()
option.add_argument('--headless')  # run Firefox without a visible window
# NOTE(review): this module-level driver is never quit in the visible code;
# main() opens its own driver. Confirm whether this one is still needed.
dr = webdriver.Firefox(options=option,firefox_profile=profile)
dr.get("https://www.cbioportal.org/patient?studyId=blca_bgi&caseId=B10")

我们要找href的页面

我们提取Sample ID列里面的链接，就到另一个页面：

我们要提取的页面

先要把表格里所有的行都显示出来（模拟点击“Show more”按钮），然后把列选择下拉框里的复选框全部勾选，最后再把表格里的内容爬下来。

def extract_mutations(link,dr_sub):
    """Open a sample page, expand its table and print every table row.

    The page is loaded in ``dr_sub``, the "show more" button is clicked up
    to 8 times, every checkbox is ticked after opening the column dropdown,
    and the text of each cell is collected row by row.

    Args:
        link: URL of the page to open.
        dr_sub: Selenium WebDriver instance used to load and inspect the page.

    Returns:
        A list with one entry per ``//tbody/tr`` row; each entry is the list
        of cell texts of that row. Every row is also printed.
    """
    show_more_xpath = '//div/button[@type="button" and @id="showMoreButton"]'
    columns_button_xpath = '//div/button[@id="dropdown-custom-1"]'

    print ("checking this link:"+link)
    dr_sub.get(link)
    time.sleep(4)  # give the page time to render

    # Expand the table: click "show more" while it is present and enabled,
    # at most 8 times so a button that never disables cannot loop forever.
    for _ in range(8):
        show_more = dr_sub.find_elements(By.XPATH, show_more_xpath)
        if not show_more or not show_more[0].is_enabled():
            break
        show_more[0].click()
        time.sleep(1.5)

    # Open the column chooser and tick every checkbox so that all columns
    # (coordinates etc.) are shown. Skipped when the chooser is absent.
    columns_button = dr_sub.find_elements(By.XPATH, columns_button_xpath)
    if columns_button:
        columns_button[0].click()
        for checkbox in dr_sub.find_elements(By.XPATH, "//*[@type='checkbox']"):
            if not checkbox.is_selected():
                checkbox.click()
        # Let the table re-render with the newly enabled columns before
        # reading it (one wait here instead of one per row).
        time.sleep(1)

    rows = []
    for tr in dr_sub.find_elements(By.XPATH, '//tbody/tr'):
        # './/td' is relative to this row; a bare '//td' would search the
        # whole document and return every cell on the page for each row.
        cells = [td.text for td in tr.find_elements(By.XPATH, './/td')]
        print(cells)
        rows.append(cells)
    return rows

开始测试的时候不小心弄了一个死循环，之后就报了下面的错误，我当时判断是直接被封了IP（注意：报错里的 127.0.0.1 是本地 WebDriver 服务的地址，所以也有可能是本地的浏览器/驱动进程已经退出）：

urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=40283): Max retries exceeded with url: /session/0023ea81-f0c1-4cdc-a22b-4bfee9c75eb4/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f5f090e7b70>: Failed to establish a new connection: [Errno 111] Connection refused',))

于是换IP，也就是前面配置代理的那两行（实际循环跑的时候需要不断换IP，否则连接会断，反爬很敏感）：

# Excerpt of the profile setup: set the HTTP proxy host and port on the
# Firefox profile so requests leave from a different IP address.
# NOTE(review): Firefox presumably needs network.proxy.type=1 (manual) for
# these to take effect, and https:// pages use network.proxy.ssl /
# network.proxy.ssl_port -- confirm against the Firefox preference docs.
profile.set_preference('network.proxy.http','222.73.130.111')
profile.set_preference('network.proxy.http_port', 888)

拿一个测试的：

def main():
    """Crawl the blca_bgi clinical-data table and print each sample's table.

    Opens the study's clinical-data page, collects from every table row the
    href of the second ``target="_blank"`` link (presumably the Sample ID
    column, after the Patient ID link -- verify against the live page), and
    then passes each collected URL to ``extract_mutations``.

    Only the linked pages are printed; the parent row itself is not, because
    the linked page already carries that information.
    """
    main_link='https://www.cbioportal.org/study/clinicalData?id=blca_bgi'
    dr = webdriver.Firefox(options=option,firefox_profile=profile)
    try:
        dr.get(main_link)
        time.sleep(10)  # wait for the clinical-data table to render

        # Collect all hrefs first: extract_mutations() navigates the same
        # driver away from this page, which would invalidate the row
        # elements if they were still being iterated.
        sample_links=[]
        for tr in dr.find_elements(By.XPATH, '//tbody/tr'):
            # Relative XPath so only this row's links are considered.
            anchors=tr.find_elements(By.XPATH, './/td/a[@target="_blank"]')
            if len(anchors)<2:
                continue
            href=anchors[1].get_attribute('href')
            if href:
                sample_links.append(href)

        for link_1 in sample_links:
            extract_mutations(link_1,dr)
    finally:
        # Always shut the browser down, even if a page fails to load.
        dr.quit()


if __name__ == '__main__':
    main()

结果直接打印了，可以作为脚本跑，输出到文件。

最后编辑于：2021.05.26 18:00:06

cbioportal爬取

推荐阅读更多精彩内容