分享

Python爬虫获取HSK动态作文语料库语料

 岳士君 2015-12-15

# 将程序、keyword文件放在桌面,执行程序。注意:keyword文本文件需每词一行。

import requests
import bs4
import re

base_url = 'http://202.112.195.192:8060/hsk/'

# Fetch the search-result page for a keyword (the first page by default).
def get_page(keyword, page_num=None, timeout=30):
    """Download one search-result page for *keyword* and return it parsed.

    Args:
        keyword: Word to search for. It is sent GB2312-encoded, so it must
            be representable in that encoding.
        page_num: Page number to request. With the default ``None`` the
            ``page`` parameter is left out of the query string (requests
            skips parameters whose value is None).
        timeout: Seconds to wait for the server before giving up. Added
            with a default so existing two-argument calls keep working.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from the GBK-decoded response.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or an HTTP error status.
        UnicodeEncodeError: If *keyword* cannot be encoded as GB2312.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36',}
    # NOTE(review): hard-coded session cookie with a 2015 timestamp and a
    # user name in it; presumably it has to be replaced with a live session
    # before the site returns results -- confirm.
    cookies = dict(ASPSESSIONIDSQASCRST='JKENNPPCMNBHKOGDCCLIIHPA', zwyl='rights=2&utime=2015%2F12%2F14+9%3A12%3A21&username=rebellion51')
    params = dict(keyword=keyword.encode('gb2312'),  kind='ci', radiobutton='all', page=page_num)
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would freeze the whole batch run.
    response = requests.get(base_url+'googlecom.asp', headers=headers, cookies=cookies, params=params, timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page
    # and crashing later on a missing element.
    response.raise_for_status()
    html = response.content.decode('gbk')
    return bs4.BeautifulSoup(html, 'html.parser')

# Parse a keyword's result page and read the total number of pages from it.
def get_total_page_num(soup):
    """Return the total page count shown on a parsed result page.

    The text of the first ``div`` found inside the element with id
    ``Layer1`` is searched for the marker ``共 N 页`` ("N pages in total");
    the first such number is returned.

    Args:
        soup: Parsed page offering the BeautifulSoup ``find`` interface.

    Returns:
        The page count as an ``int``.

    Raises:
        AttributeError: If one of the ``find`` lookups returns ``None``.
        IndexError: If the text contains no ``共 N 页`` marker.
    """
    pager = soup.find('div', id='Layer1').find('div')
    pager_text = pager.text.strip()
    counts = re.findall(r'共\s*(\d+)\s*页', pager_text)
    first_count = counts[0]
    return int(first_count)

# Parse a result page and pull out the sentences it lists.
def get_sentences(html):
    """Return the cleaned sentence strings found on a parsed result page.

    Takes the first ``table`` inside the element with id ``Layer1`` and
    reads every second ``td`` cell, starting with the cell at index 1
    (presumably the other cells hold row numbers -- verify against the
    page). The label ``原始语料`` is removed from each cell's text and
    surrounding whitespace is stripped.

    Args:
        html: Parsed page offering the BeautifulSoup ``find`` interface
            (a parsed tree, not a raw HTML string).

    Returns:
        A list of sentence strings in page order.
    """
    result_table = html.find('div', id='Layer1').find('table')
    sentences = []
    for cell in result_table.find_all('td')[1::2]:
        without_label = cell.text.replace('原始语料', '')
        sentences.append(without_label.strip())
    return sentences

# Save every sentence collected for one keyword to that keyword's file.
def save_sentences(html, keyword):
    """Append sentences to ``<keyword>.txt``, one sentence per line.

    Args:
        html: Iterable of sentence strings. Despite the parameter name,
            each item is written out as one plain line of text.
        keyword: File name stem; ``.txt`` is appended to it. The file is
            opened in append mode, so repeated calls add to it.
    """
    # Pin the encoding: without it open() uses the platform's locale
    # encoding, so the output differs between machines and writing
    # Chinese text can raise UnicodeEncodeError on some of them.
    with open(keyword+'.txt', 'at', encoding='utf-8') as f:
        f.writelines(i+'\n' for i in html)

# Read the keyword list from the keyword file (one keyword per line).
# NOTE(review): the file is read in the platform's default encoding because
# the encoding of existing keyword files is not known here -- confirm.
with open('keyword.txt', 'rt') as f:
    # Strip whitespace and drop blank lines so that an empty or trailing
    # line is not sent to the server as an empty search.
    keywords = [key.strip() for key in f if key.strip()]

# For each keyword: walk through every result page, collect the sentences
# and save them to the keyword's file.
for keyword in keywords:
    # The request without a page number is used to read the page count;
    # per the notes on get_page it is also the first result page, so it is
    # kept and reused below instead of downloading page 1 a second time.
    first_page = get_page(keyword)
    total_page_num = get_total_page_num(first_page)
    print('词语:【{}】共{}页'.format(keyword, total_page_num))
    sentences = []
    for page_num in range(1, total_page_num+1):
        print('-----------------正在抓取第{}页------------'.format(page_num))
        page = first_page if page_num == 1 else get_page(keyword, page_num)
        sentences.extend(get_sentences(page))

    save_sentences(sentences, keyword)
    print('词语:【{}】已抓取完成'.format(keyword))
  

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0)

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多