爬虫采集A股在百度上的指数表现,可以反映对应A股在网上的具体真实搜索量,也可以采集其他关键词的Python爬取百度指数程序 - 文章中心
爬虫采集A股在百度上的指数表现,可以反映对应A股在网上的具体真实搜索量,也可以采集其他关键词的Python爬取百度指数程序
2024-12-16
import requests import json from datetime import date, timedelta import pandas as pd class DownloadBaiDuIndex(object): def __init__(self, cookie): self.cookie = cookie self.headers = { "Connection": "keep-alive", "Accept": "application/json, text/plain, */*", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",

爬虫采集A股在百度上的指数表现,可以反应对应A股在网上的具体真实搜索量,也可以采集其他关键词的python爬取百度指数程序

"Sec-Fetch-Site": "same-origin", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://index.baidu.com/v2/main/index.html", "Accept-Language": "zh-CN,zh;q=0.9", 'cookie': self.cookie, "Host": "index.baidu.com", "X-Requested-With": "XMLHttpRequest", "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==", } def decrypt(self, ptbk, index_data): n = len(ptbk) // 2 a = dict(zip(ptbk[:n], ptbk[n:])) return "".join([a[s] for s in index_data]) def get_index_data_json(self, keys, start=None, end=None): words = [[{"name": key, "wordType": 1}] for key in keys] words = str(words).replace(" ", "").replace("'", """) url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}' print(words, start, end) res = requests.get(url, headers=self.headers) data = res.json()['data'] uniqid = data['uniqid'] url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}' res = requests.get(url, headers=self.headers) ptbk = res.json()['data'] result = {} result["startDate"] = start result["endDate"] = end for userIndexe in data['userIndexes']: name = userIndexe['word'][0]['name'] tmp = {} index_all = userIndexe['all']['data'] index_all_data = [int(e) for e in self.decrypt(ptbk, index_all).split(",")] tmp["all总共"] = index_all_data index_pc = userIndexe['pc']['data'] index_pc_data = [int(e) for e in self.decrypt(ptbk, index_pc).split(",")] tmp["pc电脑端"] = index_pc_data index_wise = userIndexe['wise']['data'] index_wise_data = [int(e) for e in self.decrypt(ptbk, index_wise).split(",")] tmp["wise移动端"] = index_wise_data result[name] = tmp 
return result def GetIndex(self, keys, start=None, end=None): today = date.today() if start is None: start = str(today - timedelta(days=8)) if end is None: end = str(today - timedelta(days=2)) try: raw_data = self.get_index_data_json(keys=keys, start=start, end=end) raw_data = pd.Dataframe(raw_data[keys[0]]) raw_data.index = pd.date_range(start=start, end=end) except Exception as e: print(e) raw_data = pd.Dataframe({'all总共': [], 'pc电脑端': [], 'wise移动端': []}) finally: return raw_data cookie = 'BIDUPSID=C869009149BF46790B56A24CFE042937; PSTM=1658471568; BAIDUID=C869009149BF4679693D97802F6F139F:FG=1; H_PS_PSSID=36833_36545_36465_36726_36454_36414_36690_36166_36816_36569_36802_36653_36745_26350_36861; BA_HECTOR=81aka125048g852l2g070fbf1hdkh4h16; ZFY=ZfuszGk3:AbZSB:AbF9Fo9EWD2CmhvSuC7qZlt3M3Q0ic:C; BAIDUID_BFESS=6F01237E24848C993C6D3F0B4698B829:FG=1; delPer=0; PSINO=6; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BCLID=11514916526111745447; BDSFRCVID=AAFOJexroG0leprDd4Pt8n7CropWxY5TDYrELPfiaimDVu-VJeC6EG0Pts1-dEu-EHtdogKKKmOTHc-F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR30WJbHMTrDHJTg5DTjhPrMLtQlWMT-MTryKKJs54JKsb-m0b8Wh6-dKxnjLbvkJGnRh4oNBUJtjJjYhfO45DuZyxomtfQxtNRJQKDE5p5hKq5S5-OobUPUyUc9LUv2Hmcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj2CKLK-oj-DIlj55P; BCLID_BFESS=11514916526111745447; BDSFRCVID_BFESS=AAFOJexroG0leprDd4Pt8n7CropWxY5TDYrELPfiaimDVu-VJeC6EG0Pts1-dEu-EHtdogKKKmOTHc-F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR30WJbHMTrDHJTg5DTjhPrMLtQlWMT-MTryKKJs54JKsb-m0b8Wh6-dKxnjLbvkJGnRh4oNBUJtjJjYhfO45DuZyxomtfQxtNRJQKDE5p5hKq5S5-OobUPUyUc9LUv2Hmcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj2CKLK-oj-DIlj55P; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1658471953; BDUSS=WR2a3k4Q34tTTZ5V01-Mm4yM0NBTVV2TXhpRmlkVExzMW1pTzZOTjVsd1Uwd0ZqRVFBQUFBJCQAAAAAAAAAAAEAAAD0tgQLY2p4Nzk0OQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABRG2mIURtpid; 
SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04084844066YhnidITU4R5uJmyZkN8xoBYzHy7wD/AAkwO8G70oN3Mfa7PN8MYP+/IUjI3jKW+94F+AjIn3KnOH9Gc1DY6jrnLKyfi1Mv6ciOBBTNl2gGgbogzPj78tKLE2KJ5B9UT3AhXfQv6ZlD3Rlz788r9jCeyhyDlleYGZj7Asv0q8ho5cLpWtML60wvhnYzjF/J1zPA5EuiBZaOaLcUjQfYmwJzHcJOeu7QGV/d8DPPsZYIkU1zslfGzbz73PIdhiudRioC4O4bF8roCztUhkUOcvyw==45188015791503109006885864329534; __cas__rn__=408484406; __cas__st__212=c9de5887ddfe52fdafae9d5993b717102b6134082e365b21c12e7f5e1a409f084fc24942f44c45fafa36241a; __cas__id__212=41208052; CPID_212=41208052; CPTK_212=431331100; Hm_up_d101ea4d2a5c67dab98251f0b5de24dc={"uid_":{"value":"184858356","scope":1}}; bdindexid=vm6eikc24agnmg5pjunc6eair1; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1658472033; ab_sr=1.0.1_Y2JhMDg1MWEzZGY0NGNlZjJiYzQ0YTk3OWEzYjVjYzg4MTE0NjM4MWMwZTczZTBjMjUwOWNhYWZhMDgzMGU5NTgxNzQzMzczZGVmOTJlM2ViZDE1ZTQ0YTdjYjkyNmFlM2QzMWE0OGM0MTUwMTcxNmFiYzgxMzgwOTliYjdiMmQ1MzNjMmZkYWFmODg5ZTA4NWUyZjA0Y2MyMjk5N2NjYg==; BDUSS_BFESS=WR2a3k4Q34tTTZ5V01-Mm4yM0NBTVV2TXhpRmlkVExzMW1pTzZOTjVsd1Uwd0ZqRVFBQUFBJCQAAAAAAAAAAAEAAAD0tgQLY2p4Nzk0OQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABRG2mIURtpid; RT="z=1&dm=baidu.com&si=8f410c95-43c2-464c-afbc-e5d67e79f168&ss=l5w3do7k&sl=k&tt=thv&bcn=https://fclog.baidu.com/log/weirwood?type=perf"' #cookie里面的值在网页里登录然后刷新后Network里面sug?inputword里面找到 # 初始化一个类 downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie) A=['中国宝安','神州数码','中钨高新','东信和平','联创电子',] for XXXX in A: data = downloadbaiduindex.GetIndex(keys=[XXXX], start='2022-02-28', end='2022-07-26') data data.to_csv(str(XXXX)+'.csv')#这里最开始格式是csv,我改为xlsx然后分列和采集交易量相配合,
  I   II   III   IV