【原】当科研遇见python

微生信生物 2021-01-16

展开全文

R语言分析技术
扩增子专题
基于phyloseq的微生物群落分析
代谢组专题
当科研遇见python
杂谈
所需模块
定义函数
扫描下方二维码加入群聊
当科研遇见python
python爬虫爬取nature网站
历史目录

很高兴开展这一专栏的写作。本专栏作者“抱起大块块”将python之道与科学研究相结合，以别样的方式让我们逐渐明朗——当科研遇见python，两者会产生怎样的火花呢？

下面来看看我们抱起大块块的表演：

python爬虫爬取nature网站

我们知道nature是开放性期刊，并且网页是静态的，爬取非常容易。今天我将为大家演示如何通过关键词，使用python爬取nature网站。

本函数运行

所需模块

import requests
import bs4
from bs4 import BeautifulSoup
import traceback
import re
import time
from fake_useragent import UserAgent
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

定义函数

#获取网页HTML文本

def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    A random User-Agent string from ``fake_useragent`` is sent with the
    request, and the response encoding is taken from ``apparent_encoding``
    before the body is decoded.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when anything goes wrong, so the
        result can always be handed to ``re`` / BeautifulSoup safely.
    """
    try:
        user_agent = UserAgent().random
        print(user_agent)
        headers = {'User-Agent': user_agent, 'Connection': 'close'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        # ``except Exception`` instead of a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print('网络连接错误')
        return ''

下载PDF函数

def downPDF(tag):
    """Download the PDF of every article ID in ``tag`` into ``生成文件/``.

    Args:
        tag: Sequence of article IDs, i.e. the URL component that follows
            ``/articles/`` (presumably the list filled by ``getID`` --
            confirm against the caller).

    Returns:
        None. Each PDF is written to ``生成文件/<id>.pdf``; IDs whose
        download fails are reported and skipped.
    """
    if not tag:
        return
    import os  # local import: only needed to create the output directory

    os.makedirs('生成文件', exist_ok=True)
    for article_id in tag:
        # The IDs come from the ``tag`` parameter; the previous body indexed
        # an undefined name here and raised NameError on the first iteration.
        downURL = 'https://www.nature.com/articles/' + article_id + '.pdf'
        try:
            r = requests.get(downURL, timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            print('网络连接错误')
            continue
        # ``with`` guarantees the file is closed even if the write fails.
        with open('生成文件/' + article_id + '.pdf', 'wb') as f:
            f.write(r.content)

获得文献ID

def getID(html, herf):
    """Collect article IDs found in ``html`` and append them to ``herf``.

    Every ``href="/articles/s...`` link matched by the pattern contributes
    the text that follows ``/articles/``. ``herf`` is modified in place and
    nothing is returned.
    """
    links = re.findall(r'href="/articles/s\w*-\w*-\w*-\w*', html)
    # Splitting on 's/' cuts right after "articles/", leaving the bare ID.
    herf.extend(link.split('s/')[1] for link in links)

获取文献标题、作者、DOI号和摘要

def getList(herf, list):
    """Append title, author, DOI and abstract of each article to ``list``.

    For every ID in ``herf`` the article page is downloaded and four values
    are appended to ``list`` in this order: ``dc.title``, ``dc.creator``
    (first matching meta tag), ``prism.doi`` and ``dc.description``.

    Args:
        herf: Iterable of article IDs (the URL part after ``/articles/``).
        list: Flat output list, extended in place. The parameter name
            shadows the builtin but is kept for caller compatibility.

    Returns:
        None.
    """
    meta_names = ('dc.title', 'dc.creator', 'prism.doi', 'dc.description')
    for article_id in herf:
        print(article_id)
        try:
            r = requests.get('https://www.nature.com/articles/' + article_id,
                             timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            # Skip this article entirely so ``list`` stays a multiple of 4.
            print('网络连接错误')
        else:
            soup = BeautifulSoup(r.text, 'html.parser')
            for meta_name in meta_names:
                meta = soup.find(attrs={'name': meta_name})
                # A missing tag yields '' instead of a TypeError, which keeps
                # exactly four fields per article.
                list.append(meta.get('content', '') if meta is not None else '')
        # Pause between requests to avoid hammering the server.
        time.sleep(5)

打印文献列表函数（带序号）

def printWenxianList(list):
    """Print a numbered, tab-separated table of literature records.

    Args:
        list: Iterable of records; each record is indexed 0-3 and printed
            under the title, author, DOI and abstract headers respectively.
            The parameter name shadows the builtin but is kept for caller
            compatibility.

    Returns:
        None.
    """
    tplt = '{:10}\t{:10}\t{:10}\t{:10}\t{:30}'
    print(tplt.format("序号", "题目", "作者", "DOI", "摘要"))
    # Numbering starts at 1; the stray ''' that followed the last print
    # statement (a syntax error) has been removed.
    for count, g in enumerate(list, start=1):
        print(tplt.format(count, g[0], g[1], g[2], g[3]))

主函数：这里设置查找10页，可根据自己的需求更改页数

def main():
    """Search nature.com for a keyword and save the hits to a CSV file.

    Prompts for a keyword, walks ``depth`` result pages collecting article
    IDs, fetches title/author/DOI/abstract for each ID via ``getList`` and
    writes the table to ``生成文件/ceshi.csv``.
    """
    import os
    from urllib.parse import quote_plus

    key = input('请输入关键字')
    depth = 10  # number of result pages to scan
    # Encode the keyword so spaces, '&', '#', ... cannot corrupt the query.
    start_url = 'https://www.nature.com/search?q=' + quote_plus(key)
    herf = []
    records = []
    # ``depth + 1`` so that all ``depth`` pages are visited; the previous
    # ``range(1, depth)`` stopped one page short.
    for page in range(1, depth + 1):
        url = start_url if page == 1 else start_url + '&page=' + str(page)
        try:
            html = getHTMLText(url)
            if html:  # a failed download yields nothing to parse
                getID(html, herf)
        except Exception:
            print('程序错误')
    print(herf)
    getList(herf, records)
    print(records)
    # ``records`` is flat: four consecutive entries per article.
    data = np.array(records).reshape(len(records) // 4, 4)
    df = DataFrame(data, columns=['title', 'author', 'doi', '摘要'])
    # Create the output directory first; to_csv does not create it.
    os.makedirs('生成文件', exist_ok=True)
    df.to_csv('生成文件/ceshi.csv', sep='?')
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()