抓取百度汉语词语相关内容(拼音、释义、近义词/反义词)爬虫

发表时间:2020-11-16

需求:

根据HSK词汇表搜索相关词语,并爬取其中的拼音、释义、近义词/反义词

使用语言及编译器:

python
pycharm

目标网站:

百度汉语:https://hanyu.baidu.com/

目标网页分析:

网页首页无任何东西,需要搜索进行跳转
在这里插入图片描述
F12查看JavaScript加载后的网页源代码
在这里插入图片描述
右击查看网页源代码
在这里插入图片描述
通过对比,网页搜索跳转以后加载的是静态网页。故不需要进行逆向分析或者使用selenium库。

ps.F12调出的是网页html代码的集合,并不是完整的网页html代码

爬取思路

1、获取页面
2、创建一个字典,用于存储爬取的相关数据
3、将字典存储为json文件,方便导入MySQL
4、连接数据库,使用for循环爬取HSK考试常用词组

实现代码

导入相关的包

import urllib.request
from lxml import etree
from urllib.parse import urlencode, unquote
import requests
import re
import json
import time
import pymysql

1、获取页面

def Net(url, headers, retries=3):
    """Fetch *url* and return the page body decoded as UTF-8.

    Retries on failure, sleeping 10 s between attempts because the site
    throttles rapid requests.  Returns None when every attempt fails.

    :param url: full URL to request
    :param headers: dict of HTTP request headers
    :param retries: maximum number of retry attempts (new parameter with
        a default, so existing two-argument callers are unaffected)
    """
    try:
        request = urllib.request.Request(url, headers=headers)
        # Short timeout keeps the crawl moving past slow responses.
        html = urllib.request.urlopen(request, timeout=0.7).read().decode("utf8")
        return html
    except (OSError, UnicodeDecodeError, ValueError):
        if retries <= 0:
            return None  # give up instead of recursing forever
        time.sleep(10)
        # BUG FIX: the original recursed without `return`, so every
        # retried fetch silently yielded None even on success.
        return Net(url, headers, retries - 1)
def get_baidu_page(kw, url):
    """Download the Baidu Hanyu result page for *kw* and parse it with lxml."""
    # Browser-like request headers so the site serves the normal page.
    request_headers = {
        'Accept': 'text / html, application / xhtml + xml, application / xml,*/*;q = 0.9;q = 0.8',
        'Accept - Encoding': 'gzip, deflate, br',
        'Accept - Language': 'zh - CN, zh;q = 0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }
    # Fetch the raw HTML, then build an element tree for XPath queries.
    page_text = Net(url, request_headers)
    content = etree.HTML(str(page_text))

2、创建字典

dic1 = {}
# The word list mixes single characters and multi-character words, and
# Baidu Hanyu renders different page layouts for the two, so the XPath
# expressions are chosen based on the keyword length.
if len(kw) == 1:
	link_list_pinyin = content.xpath('//div[@class="pronounce"]//b/text()')     # pinyin
	link_list1 = content.xpath('//div//p/text()')            # detailed definition text
	link_synonym = content.xpath('//div[@id="synonym"]//a/text()')  # synonyms
	link_antonym = content.xpath('//div[@id="antonym"]//a/text()')  # antonyms
	link_redical = content.xpath('//li[@id="radical"]/span/text()')        # radical
	link_stroke = content.xpath('//li[@id="stroke_count"]/span/text()')        # stroke count
	link_content = content.xpath('//div[@class="tab-content"]/a/text()')         # related compound words

	# Collect everything into one record (keys are the Chinese field names
	# expected downstream; each value is the raw list returned by xpath()).
	dic1["关键词"] = kw
	dic1['拼音'] = link_list_pinyin
	dic1["释义"] = link_list1
	dic1["近义词"] = link_synonym
	dic1["反义词"] = link_antonym
	dic1["部首"] = link_redical
	dic1["笔画"] = link_stroke
	dic1["相关组词"] = link_content
else:
	# Multi-character-word layout: only pinyin, definition, synonyms and
	# antonyms are available on the page.
	link_list1 = content.xpath('//div//p/text()')
	link_list_pinyin = content.xpath('//div/dl/dt[@class="pinyin"]/text()')     # pinyin
	link_synonym = content.xpath('//div[@id="synonym"]//a/text()')              # synonyms
	link_antonym = content.xpath('//div[@id="antonym"]//a/text()')              # antonyms

	dic1["关键词"] = kw
	dic1['拼音'] = link_list_pinyin
	dic1["释义"] = link_list1
	dic1["近义词"] = link_synonym
	dic1["反义词"] = link_antonym

3、存储文件

def save_file(dic):
    """Append *dic* to result.json as one JSON object per line (JSON Lines).

    BUG FIX: the original appended pretty-printed (indent=4) objects
    back to back with no separator, producing a file no JSON parser
    could read.  One compact object per line keeps every record
    individually parseable and is easy to bulk-load into MySQL.

    :param dic: scraped record to persist
    """
    json_str = json.dumps(dic, ensure_ascii=False)
    with open("result.json", "a", encoding="utf8") as file1:
        file1.write(json_str + "\n")

4、连接数据库循环爬取

# Connect to MySQL.  BUG FIX: PyMySQL >= 1.0 removed positional
# connect() arguments, so they must be passed by keyword.
conn = pymysql.connect(host="localhost", user="root",
                       password="123456", database="sys")
cursor = conn.cursor()
sql = "select WORD from bucong"
cursor.execute(sql)
results = cursor.fetchall()
# kw = input("请输入要搜索的关键词: ")
try:
	# Resume at row 426: earlier rows were crawled in a previous run.
	for row in results[426:]:
		kw = row[0]
		print(kw)
		word = {"wd": kw}
		key = urllib.parse.urlencode(word)
		url = "https://hanyu.baidu.com/s"
		fullurl = url + "?" + key + "&ptype=zici"
		# Some words are missing from Baidu Hanyu and make the scrape
		# raise.  BUG FIX: the original wrapped a doSomething()
		# placeholder in try/except instead of the call that can fail.
		try:
			get_baidu_page(kw, fullurl)
		except Exception:
			pass  # best-effort: skip unlisted words, keep crawling
finally:
	# Always release the database resources.
	cursor.close()
	conn.close()

完整代码

"""一:百度词语爬虫"""
import urllib.request
from lxml import etree
from urllib.parse import urlencode, unquote
import requests
import re
import json
import time
import pymysql

def digui(url, headers, retries=3):
    """Fetch *url* and return the page body decoded as UTF-8.

    Retries on failure, sleeping 10 s between attempts because the site
    throttles rapid requests.  Returns None when every attempt fails.

    :param url: full URL to request
    :param headers: dict of HTTP request headers
    :param retries: maximum number of retry attempts (new parameter with
        a default, so existing two-argument callers are unaffected)
    """
    try:
        request = urllib.request.Request(url, headers=headers)
        # Short timeout keeps the crawl moving past slow responses.
        html = urllib.request.urlopen(request, timeout=0.7).read().decode("utf8")
        return html
    except (OSError, UnicodeDecodeError, ValueError):
        if retries <= 0:
            return None  # give up instead of recursing forever
        time.sleep(10)
        # BUG FIX: the original recursed without `return`, so every
        # retried fetch silently yielded None even on success.
        return digui(url, headers, retries - 1)


def get_baidu_page(kw, url):
    """Fetch the Baidu Hanyu page for *kw*, scrape it, and save the result.

    Single characters and multi-character words are rendered with
    different page layouts, so the XPath expressions differ between the
    two cases.  The scraped fields go into one record dict (Chinese
    field names, raw xpath() lists as values) which is appended to
    result.json via save_file().
    """
    # Browser-like request headers so the site serves the normal page.
    request_headers = {
        'Accept': 'text / html, application / xhtml + xml, application / xml,*/*;q = 0.9;q = 0.8',
        'Accept - Encoding': 'gzip, deflate, br',
        'Accept - Language': 'zh - CN, zh;q = 0.9',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }

    # Download and parse the page.
    page_text = digui(url, request_headers)
    tree = etree.HTML(str(page_text))

    record = {"关键词": kw}
    if len(kw) == 1:
        # Single-character layout: pinyin, definition, synonyms,
        # antonyms, radical, stroke count and related compounds.
        record['拼音'] = tree.xpath('//div[@class="pronounce"]//b/text()')
        record["释义"] = tree.xpath('//div//p/text()')
        record["近义词"] = tree.xpath('//div[@id="synonym"]//a/text()')
        record["反义词"] = tree.xpath('//div[@id="antonym"]//a/text()')
        record["部首"] = tree.xpath('//li[@id="radical"]/span/text()')
        record["笔画"] = tree.xpath('//li[@id="stroke_count"]/span/text()')
        record["相关组词"] = tree.xpath('//div[@class="tab-content"]/a/text()')
    else:
        # Multi-character-word layout: only pinyin, definition,
        # synonyms and antonyms are available.
        record['拼音'] = tree.xpath('//div/dl/dt[@class="pinyin"]/text()')
        record["释义"] = tree.xpath('//div//p/text()')
        record["近义词"] = tree.xpath('//div[@id="synonym"]//a/text()')
        record["反义词"] = tree.xpath('//div[@id="antonym"]//a/text()')

    save_file(record)

def save_file(dic):
    """Append *dic* to result.json as one JSON object per line (JSON Lines).

    BUG FIX: the original appended pretty-printed (indent=4) objects
    back to back with no separator, producing a file no JSON parser
    could read.  One compact object per line keeps every record
    individually parseable and is easy to bulk-load into MySQL.

    :param dic: scraped record to persist
    """
    json_str = json.dumps(dic, ensure_ascii=False)
    with open("result.json", "a", encoding="utf8") as file1:
        file1.write(json_str + "\n")



if __name__ == "__main__":
    """输入要搜索的关键词和对应的url地址"""
    # 连接数据库
    conn = pymysql.connect("localhost", "root", "123456", "sys")
    cursor = conn.cursor()
    sql = "select WORD from bucong"
    cursor.execute(sql)
    results = cursor.fetchall()
    # kw = input("请输入要搜索的关键词: ")
    for row in results[426:]:
        kw = row[0]
        print(kw)
        word = {"wd":kw}
        key = urllib.parse.urlencode(word)
        url = "https://hanyu.baidu.com/s"
        fullurl = url + "?" + key + "&ptype=zici"
        get_baidu_page(kw, fullurl)
        try:
            doSomething()
        except:
            pass

运行效果

在这里插入图片描述

文章来源互联网,如有侵权,请联系管理员删除。邮箱:417803890@qq.com / QQ:417803890

微配音

Python Free

邮箱:417803890@qq.com
QQ:417803890

皖ICP备19001818号
© 2019 copyright www.pythonf.cn - All rights reserved

微信扫一扫关注公众号:

联系方式

Python Free