Python爬虫 pyppeteer 百度首页交互|无头模式和有头模式

发表时间:2020-02-22

Snipaste_2020-02-22_13-23-01.png

百度首页交互有头模式代码:

import time
import asyncio
from pyppeteer import launch
 
async def main():
    """Search Baidu for "python" in a visible browser and print result titles.

    Launches a headed browser, types the query into the search box, clicks
    the search button, waits for a result element to appear, scrolls down,
    and prints the text of every matched result link.

    Raises:
        asyncio.TimeoutError: if no element matching ``.t`` shows up within
            the polling deadline.
    """
    browser = await launch(headless=False)
    try:
        page = await browser.newPage()
        await page.setViewport({'width': 1200, 'height': 800})
        await page.goto('https://www.baidu.com')
        # Type "python" into the search box.
        await page.type('input#kw.s_ipt', 'python')
        # Click the search button.
        await page.click('input#su')

        # Wait for the results to render by polling for a result element.
        # The original author reported that waitForXPath, waitForSelector,
        # waitForFunction and waitForNavigation were unreliable here, and
        # that a fixed `asyncio.sleep(5)` is the other workable option.
        # Polling is kept, but with a short pause between probes (so the
        # browser connection is not flooded) and a deadline (so a page that
        # never renders results cannot hang the script forever).
        deadline = time.monotonic() + 30
        while not await page.querySelector('.t'):
            if time.monotonic() >= deadline:
                raise asyncio.TimeoutError(
                    'search results did not appear within 30 seconds')
            await asyncio.sleep(0.1)

        # Scroll down by one viewport height.
        await page.evaluate('window.scrollBy(0, window.innerHeight)')

        # NOTE(review): contains(@class,"t") matches any class attribute
        # containing the letter "t", not only the class token "t" -- confirm
        # whether that breadth is intended.
        title_elements = await page.xpath('//h3[contains(@class,"t")]/a')
        for item in title_elements:
            title_str = await (await item.getProperty('textContent')).jsonValue()
            print(title_str)
    finally:
        # Always shut the browser down, even if a step above failed.
        await browser.close()
 
# Run the main() coroutine to completion on the current event loop.
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

无头模式代码(并发抓取多个页面并打印标题):

import asyncio
import pyppeteer
from collections import namedtuple
 
# Immutable record describing one fetched page. The generated type name is
# "rs"; the fields are listed in positional order.
Response = namedtuple(
    "rs",
    ["title", "url", "html", "cookies", "headers", "history", "status"],
)
 
 
async def get_html(url, timeout=30):
    """Load ``url`` in a headless browser and return a ``Response`` record.

    Args:
        url: Address of the page to open.
        timeout: Navigation timeout in seconds (default 30); converted to
            whole milliseconds for ``page.goto``.

    Returns:
        A ``Response`` with the page title, the requested ``url``, the
        rendered HTML, the page cookies, the response headers and status,
        and ``history`` always set to ``None``. ``headers`` and ``status``
        are ``None`` when the navigation yields no response object.
    """
    browser = await pyppeteer.launch(headless=True, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        res = await page.goto(url, options={'timeout': int(timeout * 1000)})
        data = await page.content()
        title = await page.title()
        resp_cookies = await page.cookies()
        # goto() may return None instead of a response object; fall back to
        # None rather than failing with AttributeError.
        resp_headers = res.headers if res is not None else None
        resp_status = res.status if res is not None else None
        return Response(
            title=title,
            url=url,
            html=data,
            cookies=resp_cookies,
            headers=resp_headers,
            history=None,
            status=resp_status,
        )
    finally:
        # Close the browser on every path so each call does not leave a
        # Chromium process running.
        await browser.close()
 
 
if __name__ == '__main__':
    # Pages to fetch concurrently; the last entry is disabled.
    url_list = [
        "http://www.10086.cn/index/tj/index_220_220.html",
        "http://www.10010.com/net5/011/",
        # "http://python.jobbole.com/87541/"
    ]

    # One get_html coroutine per URL, gathered and run together on the loop.
    pending = [get_html(target) for target in url_list]
    event_loop = asyncio.get_event_loop()
    fetched = event_loop.run_until_complete(asyncio.gather(*pending))

    # Print each page's title, in the same order as url_list.
    for page_result in fetched:
        print(page_result.title)

微配音

文章来源互联网,尊重作者原创,如有侵权,请联系管理员删除。邮箱:417803890@qq.com / QQ:417803890


Python Free

邮箱:417803890@qq.com
QQ:417803890

皖ICP备19001818号
© 2019 copyright www.pythonf.cn - All rights reserved

微信扫一扫关注公众号:

联系方式

Python Free