import time
import urllib.parse
import random
import requests
import parsel
import csv
from tqdm import tqdm


# Write one scraped record to the CSV file
# (uses the module-level csv_writer created in __main__)
def download_csv(title_str, name, otherStyleTime, article_url):
    dit = {
        'title': title_str,
        'account': name,
        'publish_time': otherStyleTime,
        'article_url': article_url,
    }
    csv_writer.writerow(dit)
    # print(title_str, name, otherStyleTime, article_url)


# Parse one page of search results
def parse_label(response):
    selector = parsel.Selector(response.text)
    lis = selector.css('.news-list li')
    for li in lis:
        # Sogou wraps the matched keyword in <em> tags, which splits the
        # title text into fragments; rejoin them around the keyword itself
        title_list = li.css('.txt-box h3 a::text').getall()
        if len(title_list) == 1:
            title_str = keyword + title_list[0]
        else:
            title_str = keyword.join(title_list)
        href = li.css('.txt-box h3 a::attr(href)').get()
        date = li.css('.s-p::attr(t)').get()
        if href is None or date is None:
            continue  # skip malformed list items
        article_url = 'https://weixin.sogou.com' + href
        name = li.css('.s-p a::text').get()
        # the 't' attribute is a Unix timestamp; format it as a readable date
        timeArray = time.localtime(int(date))
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        download_csv(title_str, name, otherStyleTime, article_url)


# Visit Sogou Video to obtain a fresh SNUID cookie
def get_new_cookies():
    # Sogou Video URL
    url = 'https://v.sogou.com/v?ie=utf8&query=&p=40030600'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
    rst = requests.get(url=url,
                       headers=headers,
                       allow_redirects=False)
    cookies = rst.cookies.get_dict()
    return cookies.get('SNUID')  # may be None if Sogou did not set the cookie


if __name__ == '__main__':
    keyword = input('Enter a search keyword: ')
    # append mode keeps records across runs, but note the header row is
    # rewritten on every run
    f = open(f'wechat_articles_{keyword}.csv', mode='a', encoding='utf-8', newline='')
    csv_writer = csv.DictWriter(f, fieldnames=['title', 'account', 'publish_time', 'article_url'])
    csv_writer.writeheader()
    SNUID = get_new_cookies()
    headers = {
        # every cookie value below except SNUID is session-specific; replace
        # them with values copied from your own logged-in browser session
        'Cookie': f'IPLOC=CN5101; SUID=A08DB0753822910A0000000062D4E15C; SUV=1658118495598633; '
                  f'ppinf=5|1658121514|1659331114'
                  f'|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTozNjolRTglODclQUElRTUlOUMlQTglRTclOEIlQUMlRTglQTElOEN8Y3J0OjEwOjE2NTgxMjE1MTR8cmVmbmljazozNjolRTglODclQUElRTUlOUMlQTglRTclOEIlQUMlRTglQTElOEN8dXNlcmlkOjQ0Om85dDJsdU5QS01HS01jX2o5WjM5aG5XY1I4d29Ad2VpeGluLnNvaHUuY29tfA; pprdig=IjtTyvBfu3HPN0JuCq-GC-ua97U2qrVaL1lKmQcS5ISR8XkUoZxSWujpBlyP0zSHSItfzFGPWnzTuIhBHYtUwY-PI6COy-6I8RcER_mopugTqcBlwdO-Sc_p7j8a51jQaet93CEJ-MdpiSceBG_Nmu5cLTPN5XhC_3B92CSovak; ppinfo=80456da5d9; passport=5|1658121514|1659331114|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTozNjolRTglODclQUElRTUlOUMlQTglRTclOEIlQUMlRTglQTElOEN8Y3J0OjEwOjE2NTgxMjE1MTR8cmVmbmljazozNjolRTglODclQUElRTUlOUMlQTglRTclOEIlQUMlRTglQTElOEN8dXNlcmlkOjQ0Om85dDJsdU5QS01HS01jX2o5WjM5aG5XY1I4d29Ad2VpeGluLnNvaHUuY29tfA|242a929d54|IjtTyvBfu3HPN0JuCq-GC-ua97U2qrVaL1lKmQcS5ISR8XkUoZxSWujpBlyP0zSHSItfzFGPWnzTuIhBHYtUwY-PI6COy-6I8RcER_mopugTqcBlwdO-Sc_p7j8a51jQaet93CEJ-MdpiSceBG_Nmu5cLTPN5XhC_3B92CSovak; sgid=29-55943661-AWLU7Sr6zRRQxuX98MZMHMw; wuid=1658155657696; VIDEO_DEBUG=off; SNUID={SNUID}; ariaDefaultTheme=undefined; ppmdig=16581727390000000ba8e97bdbf22bfcefd4d50f43d86a61',
        'Host': 'weixin.sogou.com',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
                      'Safari/537.36',
    }
    # proxies = {
    #     'http': 'http://113.222.29.169:22008',
    #     'https': 'https://113.222.29.169:22008',
    # }
    print('------------- Crawl started -------------')
    for page in tqdm(range(1, 1001)):  # Sogou result pages are numbered from 1
        url = f'https://weixin.sogou.com/weixin?query={urllib.parse.quote(keyword)}&_sug_type_=&s_from=input' \
              f'&_sug_=n&type=2&page={page}&ie=utf8'
        try:
            response = requests.get(url=url, headers=headers)
            parse_label(response)
            # throttle requests to reduce the chance of being blocked
            # time.sleep(random.randint(8, 10))
        except KeyboardInterrupt:
            print('\n----------------- Aborted ----------------------')
            break  # stop cleanly on Ctrl+C instead of skipping to the next page
    f.close()
    print('------------- Crawl finished ------------')
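

# --- Optional extension (a sketch, not part of the original script) ---
# The script above fetches SNUID only once at startup. If Sogou starts
# rejecting requests partway through a long crawl, one possible fix is to
# refresh the cookie on the fly. The block-detection heuristics below are
# assumptions (a redirect to an "antispider" page, or a result page missing
# the .news-list container), not confirmed Sogou behaviour, and these
# helpers would need to be defined above the __main__ block to be usable.
import re


def refresh_snuid(headers):
    # swap the stale SNUID value inside the Cookie header for a fresh one
    new_snuid = get_new_cookies()
    if new_snuid:
        headers['Cookie'] = re.sub(r'SNUID=[^;]*', f'SNUID={new_snuid}', headers['Cookie'])
    return headers


def looks_blocked(response):
    # assumed signals of an anti-crawler block
    return 'antispider' in response.url or 'news-list' not in response.text


# possible use inside the crawl loop, right after requests.get():
#     if looks_blocked(response):
#         headers = refresh_snuid(headers)
#         continue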