【原】當科研遇見python

微生信生物 2021-01-16

展開全文

R語言分析技術(shù)
擴增子專題
基于phyloseq的微生物群落分析
代謝組專題
當科研遇見python
雜談
所需模塊
定義函數
掃描下方二維碼加入群聊
當科研遇見python
python爬蟲爬取nature網站
歷史目錄

很高興開展這一專欄的寫作，本專欄作者抱起大塊塊將python之道結合科學研究，以別樣的方式讓我們逐漸明朗——當科研遇見python，兩者會產生怎樣的火花呢？

下面來看看我們抱起大塊塊的表演：

python爬蟲爬取nature網站

我們知道nature是開放性期刊，並且其網頁是靜態的，爬取非常容易。今天我將為大家演示如何通過關鍵詞，使用python爬取nature網站。

本函數運行

所需模塊

import requests
import bs4
from bs4 import BeautifulSoup
import traceback
import re
import time
from fake_useragent import UserAgent
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

定義函數

#獲取url

def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    A random User-Agent string from ``fake_useragent`` is generated, printed,
    and sent with the request together with a ``Connection: close`` header.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or ``None`` when anything in the fetch fails
        (User-Agent generation, network error, non-2xx status). A message is
        printed in the failure case.
    """
    try:
        user_agent = UserAgent().random
        print(user_agent)
        headers = {'User-Agent': user_agent, 'Connection': 'close'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Decode with the encoding detected from the body, not the header.
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # Best-effort fetch: report the failure and return None, but let
        # KeyboardInterrupt / SystemExit propagate (a bare ``except`` hid them).
        print('網(wǎng)絡(luò)連接錯(cuò)誤')
        return None

下載PDF函數

def downPDF(tag):
    """Download the PDF of every article ID in ``tag``.

    Each PDF is fetched from ``https://www.nature.com/articles/<id>.pdf`` and
    written to ``生成文件/<id>.pdf``. The output directory must already exist.

    Args:
        tag: Iterable of article ID strings (the path segment that follows
            ``/articles/``), i.e. the kind of values ``getID`` collects.

    Returns:
        None.
    """
    # The original body indexed an undefined global ``herf`` instead of its
    # own parameter, which raised NameError for any non-empty input.
    for article_id in tag:
        downURL = 'https://www.nature.com/articles/' + article_id + '.pdf'
        r = requests.get(downURL, timeout=30)
        # ``with`` guarantees the file is closed even if the write fails.
        with open('生成文件/' + article_id + '.pdf', 'wb') as f:
            f.write(r.content)

獲得文獻ID

def getID(html, herf):
    """Collect article IDs from a search-result page.

    Every ``href="/articles/s...-...-...-..."`` link found in ``html`` is
    reduced to the part after ``articles/`` and appended to ``herf``.

    Args:
        html: Page source to scan.
        herf: List that receives the IDs; it is modified in place.

    Returns:
        None.
    """
    link_pattern = r'href="/articles/s\w*-\w*-\w*-\w*'
    # Splitting on 's/' cuts right after "articles/", leaving only the ID.
    herf.extend(link.split('s/')[1] for link in re.findall(link_pattern, html))

獲取DOI號

def _meta_content(soup, name):
    """Return the ``content`` of the ``<meta name=...>`` tag, or '' if absent."""
    node = soup.find(attrs={'name': name})
    if node is None:
        return ''
    return node.get('content', '')


def getList(herf, list):
    """Fetch each article page and append its metadata to ``list``.

    For every ID in ``herf`` the page ``https://www.nature.com/articles/<id>``
    is downloaded and four values are appended to ``list`` in this order:
    title (``dc.title``), author (``dc.creator``), DOI (``prism.doi``) and
    description (``dc.description``). A missing meta tag contributes an empty
    string, so each article always adds exactly four entries.

    Args:
        herf: Iterable of article ID strings.
        list: Flat list that receives the values; modified in place.
            (The name shadows the builtin but is kept for compatibility.)

    Returns:
        None.
    """
    for i in herf:
        r = requests.get('https://www.nature.com/articles/' + i, timeout=30)
        print(i)
        soup = BeautifulSoup(r.text, 'html.parser')
        title = _meta_content(soup, 'dc.title')
        description = _meta_content(soup, 'dc.description')
        doi = _meta_content(soup, 'prism.doi')
        creator = _meta_content(soup, 'dc.creator')
        list.append(title)
        list.append(creator)
        list.append(doi)
        list.append(description)
        # Pause between requests (presumably to avoid hammering the server).
        time.sleep(5)

計數函數

def printWenxianList(list):
    """Print a numbered, tab-separated table of literature records.

    A header row is printed first, then one row per record, numbered from 1.

    Args:
        list: Iterable of records; items 0-3 of each record are printed under
            the title, author, DOI and abstract headers respectively.
            (The name shadows the builtin but is kept for compatibility.)

    Returns:
        None.
    """
    tplt = '{:10}\t{:10}\t{:10}\t{:10}\t{:30}'
    print(tplt.format("序號(hào)","題目","作者","DOI","摘要"))
    # A stray ''' used to follow the print call below and made the whole
    # file a SyntaxError; it has been removed.
    for count, g in enumerate(list, start=1):
        print(tplt.format(count, g[0], g[1], g[2], g[3]))

主函數：這裡設置查找10頁，可根據自己的需求更改頁數

def main():
    """Search nature.com for a keyword and save article metadata to CSV.

    Prompts for a keyword, scans the first ``depth`` search-result pages for
    article IDs, fetches title / author / DOI / description for each ID via
    ``getList``, and writes the table to ``生成文件/ceshi.csv``.

    Returns:
        None.
    """
    import os
    from urllib.parse import quote_plus

    key = input('請輸入關(guān)鍵字')
    depth = 10  # number of result pages to scan; adjust as needed
    # Encode the keyword so spaces, '&', '#', etc. cannot corrupt the query.
    start_url = 'https://www.nature.com/search?q=' + quote_plus(key)
    herf = []
    records = []
    # range(1, depth) stopped at page 9; include page ``depth`` itself.
    for i in range(1, depth + 1):
        url = start_url if i == 1 else start_url + '&page=' + str(i)
        try:
            html = getHTMLText(url)
            # getHTMLText returns None on failure, which makes getID raise;
            # that page is reported below and the loop moves on.
            getID(html, herf)
        except Exception:
            print('程序錯(cuò)誤')
    print(herf)
    getList(herf, records)
    print(records)
    # getList appends four values per article; fold them into rows of 4.
    data = np.array(records).reshape(len(records) // 4, 4)
    df = DataFrame(data, columns=['title', 'author', 'doi', '摘要'])
    # Create the output directory so the scraped data is not lost at the end.
    os.makedirs('生成文件', exist_ok=True)
    # NOTE(review): '?' as the separator looks like a mis-encoded character
    # from the original article — confirm the intended delimiter.
    df.to_csv('生成文件/ceshi.csv', sep='?')
# Script entry point: the scraper runs as soon as this file is executed.
main()