pyppeteer 爬取京东商城和淘宝示例代码|获取cookie爬取搜索内容

发表时间:2020-02-22

爬取京东商城示例代码:

import requests
from bs4 import BeautifulSoup
from pyppeteer import launch
import asyncio
 
 
def screen_size():
    """使用tkinter获取屏幕大小"""
    import tkinter
    tk = tkinter.Tk()
    width = tk.winfo_screenwidth()
    height = tk.winfo_screenheight()
    tk.quit()
    return width, height
 
 
async def main(url):
    # browser = await launch({'headless': False, 'args': ['--no-sandbox'], })
    browser = await launch({'args': ['--no-sandbox'], })
    page = await browser.newPage()
    width, height = screen_size()
    await page.setViewport(viewport={"width": width, "height": height})
    await page.setJavaScriptEnabled(enabled=True)
    await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'
    )
    await page.goto(url)
 
    # await asyncio.sleep(2)
 
    await page.evaluate('window.scrollBy(0, document.body.scrollHeight)')
 
    await asyncio.sleep(1)
 
    # content = await page.content()
    li_list = await page.xpath('//*[@id="J_goodsList"]/ul/li')
 
    # print(li_list)
    item_list = []
    for li in li_list:
        a = await li.xpath('.//div[@class="p-img"]/a')
        detail_url = await (await a[0].getProperty("href")).jsonValue()
        promo_words = await (await a[0].getProperty("title")).jsonValue()
        a_ = await li.xpath('.//div[@class="p-commit"]/strong/a')
        p_commit = await (await a_[0].getProperty("textContent")).jsonValue()
        i = await li.xpath('./div/div[3]/strong/i')
        price = await (await i[0].getProperty("textContent")).jsonValue()
        em = await li.xpath('./div/div[4]/a/em')
        title = await (await em[0].getProperty("textContent")).jsonValue()
        item = {
            "title": title,
            "detail_url": detail_url,
            "promo_words": promo_words,
            'p_commit': p_commit,
            'price': price
        }
        item_list.append(item)
        # print(item)
        # break
    # print(content)
 
    await page_close(browser)
    return item_list
 
 
async def page_close(browser):
    for _page in await browser.pages():
        await _page.close()
    await browser.close()
 
 
msg = "手机"
url = "https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq={}&cid2=653&cid3=655&page={}"
 
task_list = []
for i in range(1, 6):
    page = i * 2 - 1
    url = url.format(msg, msg, page)
    task_list.append(main(url))
 
loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*task_list))
# print(results, len(results))
for i in results:
    print(i, len(i))
 
print('*' * 100)
# soup = BeautifulSoup(content, 'lxml')
# div = soup.find('div', id='J_goodsList')
# for i, li in enumerate(div.find_all('li', class_='gl-item')):
#     if li.select('.p-img a'):
#         print(li.select('.p-img a')[0]['href'], i)
#         print(li.select('.p-price i')[0].get_text(), i)
#         print(li.select('.p-name em')[0].text, i)
#     else:
#         print("#" * 200)
#         print(li)


爬取淘宝示例代码:

# -*- coding: utf-8 -*-
 
import time
import random
import asyncio
from retrying import retry  # 错误自动重试
from pyppeteer.launcher import launch
 
 
js1 = '''() =>{Object.defineProperties(navigator,{ webdriver:{ get: () => false}})}'''
js2 = '''() => {alert(window.navigator.webdriver)}'''
js3 = '''() => {window.navigator.chrome = {runtime: {}, }; }'''
js4 = '''() =>{Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});}'''
js5 = '''() =>{Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5,6],});}'''
 
 
def retry_if_result_none(result):
    return result is None
 
 
@retry(retry_on_result=retry_if_result_none, )
async def mouse_slide(page=None):
    await asyncio.sleep(3)
    try:
        await page.hover('#nc_1_n1z')
        await page.mouse.down()
        await page.mouse.move(2000, 0, {'delay': random.randint(1000, 2000)})
        await page.mouse.up()
 
    except Exception as e:
        print(e, '     :slide login False')
        return None
    else:
        await asyncio.sleep(3)
        slider_again = await page.Jeval('.nc-lang-cnt', 'node => node.textContent')
        if slider_again != '验证通过':
            return None
        else:
            await page.screenshot({'path': './headless-slide-result.png'})
            print('验证通过')
            return 1
 
 
def input_time_random():
    return random.randint(100, 151)
 
 
def screen_size():
    """使用tkinter获取屏幕大小"""
    import tkinter
    tk = tkinter.Tk()
    width = tk.winfo_screenwidth()
    height = tk.winfo_screenheight()
    tk.quit()
    return width, height
 
 
async def main(username, pwd, url):
    browser = await launch(
        {'headless': False, 'args': ['--no-sandbox'], },
        userDataDir='./userdata',
        args=['--window-size=1366,768']
    )
    page = await browser.newPage()
    width, height = screen_size()
    await page.setViewport(viewport={"width": width, "height": height})
    await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299'
    )
 
    await page.goto(url)
    await page.evaluate(js1)
    await page.evaluate(js3)
    await page.evaluate(js4)
    await page.evaluate(js5)
 
    pwd_login = await page.querySelector('.J_Quick2Static')
    # print(await (await pwd_login.getProperty('textContent')).jsonValue())
    await pwd_login.click()
 
    await page.type('#TPL_username_1', username, {'delay': input_time_random() - 50})
    await page.type('#TPL_password_1', pwd, {'delay': input_time_random()})
 
    await page.screenshot({'path': './headless-test-result.png'})
    time.sleep(2)
 
    slider = await page.Jeval('#nocaptcha', 'node => node.style')  # 是否有滑块
 
    if slider:
        print('出现滑块情况判定')
        await page.screenshot({'path': './headless-login-slide.png'})
        flag = await mouse_slide(page=page)
        if flag:
            print(page.url)
            await page.keyboard.press('Enter')
            await get_cookie(page)
    else:
        await page.keyboard.press('Enter')
        await page.waitFor(20)
        await page.waitForNavigation()
        try:
            global error
            error = await page.Jeval('.error', 'node => node.textContent')
        except Exception as e:
            error = None
            print(e, "错啦")
        finally:
            if error:
                print('确保账户安全重新入输入')
            else:
                print(page.url)
                # 可继续网页跳转 已经携带 cookie
                # await get_search(page)
                await get_cookie(page)
    await page_close(browser)
 
 
async def page_close(browser):
    for _page in await browser.pages():
        await _page.close()
    await browser.close()
 
 
async def get_search(page):
    # https://s.taobao.com/search?q={查询的条件}&p4ppushleft=1%2C48&s={每页 44 条 第一页 0 第二页 44}&sort=sale-desc
    await page.goto("https://s.taobao.com/search?q=气球")
 
    await asyncio.sleep(5)
    # print(await page.content())
 
 
# 获取登录后cookie
async def get_cookie(page):
    res = await page.content()
    cookies_list = await page.cookies()
    cookies = ''
    for cookie in cookies_list:
        str_cookie = '{0}={1};'
        str_cookie = str_cookie.format(cookie.get('name'), cookie.get('value'))
        cookies += str_cookie
    print(cookies)
    # 将cookie 放入 cookie 池 以便多次请求 封账号 利用cookie 对搜索内容进行爬取
    return cookies
 
 
if __name__ == '__main__':
    tb_username = '淘宝用户名'
    tb_pwd = '淘宝密码'
    tb_url = "https://login.taobao.com/member/login.jhtml"
 
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(tb_username, tb_pwd, tb_url))

利用 上面 获取到的 cookie 爬取搜索内容

示例代码:

import json
import requests
import re
 
# 设置 cookie 池 随机发送请求 通过 pyppeteer 获取 cookie
cookie = '_tb_token_=edd7e354dee53;t=fed8f4ca1946ca1e73223cfae04bc589;sg=20f;cna=2uJSFdQGmDMCAbfFWXWAC4Jv;cookie2=1db6cd63ad358170ea13319f7a862c33;_l_g_=Ug%3D%3D;v=0;unb=3150916610;skt=49cbfd5e01d1b550;cookie1=BxVRmD3sh19TaAU6lH88bHw5oq%2BgcAGcRe229Hj5DTA%3D;csg=cf45a9e2;uc3=vt3=F8dByEazRMnQZDe%2F9qI%3D&id2=UNGTqfZ61Z3rsA%3D%3D&nk2=oicxO%2BHX4Pg%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D;existShop=MTU1Njg3MDM3MA%3D%3D;tracknick=%5Cu7433150322;lgc=%5Cu7433150322;_cc_=V32FPkk%2Fhw%3D%3D;mt=ci=86_1;dnk=%5Cu7433150322;_nk_=%5Cu7433150322;cookie17=UNGTqfZ61Z3rsA%3D%3D;tg=0;enc=tThHs6Sn3BAl8v1fu3J4tMpgzA1n%2BLzxjib0vDAtGsXJCb4hqQZ7Z9fHIzsN0WghdcKEsoeKz6mBwPUpyzLOZw%3D%3D;JSESSIONID=B3F383B3467EC60F8CA425935232D395;l=bBMspAhrveV5732DBOCanurza77OSIRYYuPzaNbMi_5pm6T_G4QOlC03xF96VjfRswYBqh6Mygv9-etuZ;hng=CN%7Czh-CN%7CCNY%7C156;isg=BLi41Q8PENDal3xUVsA-aPbfiWaKiRzB6vcTu_IpBPOmDVj3mjHsO86vxUQYW9SD;uc1=cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=W5iHLLyFeYZ1WM9hVnmS&cookie15=UIHiLt3xD8xYTw%3D%3D&existShop=false&pas=0&cookie14=UoTZ4ttqLhxJww%3D%3D&tag=8&lng=zh_CN;thw=cn;x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0;swfstore=34617;'
 
headers = {
    'cookie': cookie,
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}
 
rep = requests.get('https://s.taobao.com/search?q=手机&p4ppushleft=1%2C48&s=0&sort=sale-desc ', headers=headers)
rep.encoding = 'utf-8'
res = rep.text
print(res)
 
r = re.compile(r'g_page_config = (.*?)g_srp_loadCss', re.S)
res = r.findall(res)
 
data = res[0].strip().rstrip(';')
dic_data = json.loads(data)
auctions = dic_data.get('mods')['itemlist']['data']['auctions']
 
# print(auctions,len(auctions))
for item in auctions[1:]:
    print(item)
    break

文章来源互联网,尊重作者原创,如有侵权,请联系管理员删除。邮箱:417803890@qq.com / QQ:417803890


Python Free

邮箱:417803890@qq.com
QQ:417803890

皖ICP备19001818号
© 2019 copyright www.pythonf.cn - All rights reserved

微信扫一扫关注公众号:

联系方式

Python Free