Python Selenium 新浪博客自动发帖|循环爬取指定文章页面-cookies登录-自动在新浪博客发帖

发表时间:2019-12-20

效果如下:

Rec 0004.gif


完整脚本:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# 导入urllib中的request模块,用来发送http/https请求
from urllib import request, parse
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import json
import time

# Starting (highest) article id; the script entry point passes it to
# urlaction(), which walks downward from this id.
n = 36

def getinnerhtml(data):
    """Return the markup between the first opening tag and the last closing tag.

    The slice starts right after the first ``>`` and stops just before the
    last ``</``, which strips the outermost element from an HTML fragment.

    Args:
        data: HTML fragment as a string.

    Returns:
        The inner markup. If ``data`` contains no ``>``, the slice starts at
        the beginning; if it contains no ``</``, the slice runs to the end
        of the string instead of silently dropping the final character.
    """
    # find() returns -1 when ">" is absent, so start falls back to 0.
    start = data.find(">") + 1
    end = data.rfind("</")
    if end == -1:
        # No closing tag: using -1 as a slice bound would cut off the last
        # character, so keep everything after the opening tag.
        return data[start:]
    return data[start:end]

def urlaction(n, count=10):
    """Process a run of consecutive article pages in descending id order.

    For each id from ``n`` down to ``n - count + 1`` the article URL is
    built, printed, and handed to ``get_data``.

    Args:
        n: Highest article id to process.
        count: Number of consecutive ids to process. Defaults to 10, the
            window size that was previously hard-coded.
    """
    for article_id in range(n, n - count, -1):
        url = 'https://www.*******.cn/read/' + str(article_id)
        print(url)
        get_data(url)
def get_data(url):
    """Fetch one article page, rewrite its HTML for reposting, and publish it.

    The page is downloaded with a browser-like user agent. On an HTTP 200
    response the title (``#neirong h1``) and body (``#zhengwen``) are
    extracted, image sizes are normalised, relative upload paths are made
    absolute, a footer image and link are appended, and the
    ``(title, content)`` tuple is passed to ``sinaaction``.

    Args:
        url: Absolute URL of the article page.

    Returns:
        None. Nothing is published when the response status is not 200 or
        when the page lacks the expected title/body elements.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }
    req = request.Request(url, headers=headers)
    # The context manager closes the HTTP response even if reading or
    # decoding fails; the original left the connection open.
    with request.urlopen(req) as response:
        if response.getcode() != 200:
            return
        data = str(response.read(), encoding="utf-8")

    bs = BeautifulSoup(data, 'html.parser')
    title_nodes = bs.select("#neirong h1")
    content_nodes = bs.select("#zhengwen")
    if not title_nodes or not content_nodes:
        # Skip pages without the expected layout instead of raising
        # IndexError, which would abort the caller's whole crawl loop.
        print('skip ' + url + ': title or content element not found')
        return

    title = title_nodes[0].get_text()
    content = content_nodes[0].prettify()
    # Stretch fixed-width media to the container and drop fixed heights.
    content = re.sub(r'width="\d+"', 'width="100%"', content)
    content = re.sub(r'height="\d+"', '', content)
    content = content.replace('</h1>', '</h1><br>')
    # Site-relative upload paths would not resolve once the HTML is
    # posted elsewhere, so make them absolute.
    content = content.replace('src="/uploads/editor/', 'src="https://www.******.cn/uploads/editor/')
    photo = '<img alt="www.******.cn" style="width: 100%" src="https://www.******.cn/public/photo/yejiao.jpg"/>'
    alink = '<a href="https://www.ls27.cn/">科技口袋:www.ls27.cn</a>'
    # Strip the outer #zhengwen wrapper, then append the footer image/link.
    content = getinnerhtml(content) + photo + alink
    neirong = title, content
    sinaaction(neirong)

def sinalogin():
    """Open Sina Blog in Chrome and save the browser's cookies to disk.

    The browser is opened on the blog home page and left alone for 30
    seconds (presumably so the user can log in by hand; confirm), then
    all cookies are written to ``D:/test_cookies/sinablog.txt`` as one
    JSON object per line.
    """
    driver = webdriver.Chrome(executable_path="D:/chromedriver/chromedriver.exe")
    try:
        driver.get('http://blog.sina.com.cn')
        time.sleep(30)
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser down; the original never quit the
        # driver, leaving the Chrome/chromedriver processes running.
        driver.quit()
    with open(r"D:/test_cookies/sinablog.txt", 'w') as file1:
        file1.writelines(json.dumps(cookie) + "\n" for cookie in cookies)

def sinaaction(neirong):
    """Publish one article on Sina Blog using previously saved cookies.

    Starts Chrome, replaces the session cookies with those stored in
    ``D:/test_cookies/sinablog.txt`` (one JSON object per line), opens
    the "write blog" editor, fills in the title and HTML body, and
    submits the post.

    Args:
        neirong: ``(title, html_content)`` pair; index 0 is typed into
            the title field and index 1 into the editor's textarea.
    """
    driver = webdriver.Chrome(executable_path="D:/chromedriver/chromedriver.exe")
    try:
        # Cookies are added only after a page has been loaded first.
        driver.get('http://blog.sina.com.cn')
        driver.delete_all_cookies()
        with open(r"D:/test_cookies/sinablog.txt", 'r') as file2:
            for line in file2:
                if not line.strip():
                    # Tolerate blank lines in the cookie file.
                    continue
                cookie = json.loads(line)
                # NOTE(review): "expiry" is dropped before add_cookie,
                # presumably because the stored value is rejected by the
                # driver; confirm against the Selenium version in use.
                cookie.pop("expiry", None)
                driver.add_cookie(cookie)
        # Reload so the restored cookies take effect.
        driver.get('http://blog.sina.com.cn')
        driver.find_element_by_class_name('write-blog').click()
        time.sleep(2)
        # The code expects the editor in a second window and switches to it.
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        # Toggle the editor's "view code" checkbox so the body is entered
        # as raw HTML, then toggle it back after typing.
        driver.find_element_by_id('SinaEditor_59_viewcodecheckbox').click()
        driver.find_element_by_id('articleTitle').send_keys(neirong[0])
        driver.find_element_by_id('SinaEditorTextarea').send_keys(neirong[1])
        driver.find_element_by_id('SinaEditor_59_viewcodecheckbox').click()
        # NOTE(review): per the accompanying write-up these clicks enable
        # "sync to Weibo" and pick a category (third option); verify the
        # element ids against the live page.
        driver.find_element_by_id('input3').click()
        driver.find_element_by_id('componentSelect').click()
        driver.find_element_by_xpath('//*[@id="componentSelect"]/option[3]').click()
        driver.find_element_by_id('articlePostBtn').click()
        # Wait a minute after clicking the post button before closing.
        time.sleep(60)
    finally:
        # This function runs once per article; without quit() every call
        # left another Chrome instance open.
        driver.quit()


# Script entry point: process the article pages starting at the
# module-level id `n`.
if __name__ == "__main__":
    urlaction(n)

循环抓取数码评测站的帖子,并且修改格式

用cookies自动登录新浪博客,把抓取的内容自动输入,选择好分类,选择好同步到微博

输入内容时选择HTML源码编辑模式,直接用HTML代码输入最方便,格式和图片都能方便地输入进去

其他的细节内容本帖不做过多解释了,会另开帖说明


Python Free

邮箱:417803890@qq.com
QQ:417803890

皖ICP备19001818号
© 2019 copyright www.pythonf.cn - All rights reserved

微信扫一扫关注公众号:

联系方式

Python Free