Ajax数据爬取
warning:
这篇文章距离上次修改已过765天,其中的内容可能已经有所变动。
from urllib.parse import urlencode
import requests
import re
def Get_page(page_num):
    """Fetch one page of Toutiao image-search results as parsed JSON.

    Sends a GET request to ``https://so.toutiao.com/search`` with the fixed
    keyword '街拍' and the given page number.

    Args:
        page_num: Value sent as the ``page_num`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the request fails, the body is not valid JSON, or the
        status code is anything other than 200.
    """
    headers = {
        'Host': 'so.toutiao.com',
        'user-agent': 'Mozilla/5.0',
        'cookie': 'tt=acd600b5ac865033f0ee83a63ed44675; ',
    }
    base_url = 'https://so.toutiao.com/search?'
    params = {
        'keyword': '街拍',
        'pd': 'atlas',
        'dvpf': 'pc',
        'aid': '4916',
        'page_num': page_num,
        # Plain JSON text: urlencode() below percent-encodes it exactly once.
        # Supplying an already percent-encoded value would escape every '%'
        # a second time ('%7B' -> '%257B') and corrupt the parameter.
        'search_json': '{"from_search_id":"20220104115420010212192151532E8188","origin_keyword":"街拍","image_keyword":"街拍"}',
        'rawJSON': 1,
        'search_id': '2022062517173701021219402539E36546',
    }
    url = base_url + urlencode(params)
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            return r.json()
    except (requests.RequestException, ValueError):
        # RequestException covers network/HTTP-level errors; ValueError covers
        # a body that cannot be decoded as JSON.
        print('Failed!')
    return None
def get_images(json):
    """Yield a name/URL record for each image entry in a result page.

    Args:
        json: Parsed response, as returned by ``Get_page``. Image entries are
            read from ``json['rawData']['data']``. ``None``, a non-dict value,
            or a dict without those keys produces no records instead of
            raising. (The parameter name is kept for caller compatibility.)

    Yields:
        dict: ``{'name': entry['text'], 'url': entry['img_url']}`` for every
        entry that has both fields; entries missing either one are skipped.
    """
    if not isinstance(json, dict):
        return
    raw_data = json.get('rawData')
    if not isinstance(raw_data, dict):
        return
    # 'data' may be absent or null; treat both as "no images on this page".
    for image in raw_data.get('data') or []:
        if not isinstance(image, dict):
            continue
        if 'text' not in image or 'img_url' not in image:
            continue
        yield {
            'name': image['text'],
            'url': image['img_url'],
        }
def save_image(dict):
    """Download one image and write it to ``./images/<name>.<ext>``.

    Args:
        dict: Record with a 'name' key (title used for the file name) and a
            'url' key (address of the image). The parameter name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        None. If the download fails or the server answers with an error
        status, a message is printed and no file is written.
    """
    import os
    from urllib.parse import urlparse

    try:
        response = requests.get(dict['url'], timeout=10)
        # Do not store an HTTP error page as if it were image data.
        response.raise_for_status()
    except requests.RequestException:
        print('Failed to download {}'.format(dict['url']))
        return
    data = response.content

    # When the title contains '|', only the last segment is used.
    name = dict['name'].split('|')[-1]
    # Colons and double quotes become ',', question marks are dropped, and the
    # remaining characters that would break the path are replaced with '_'.
    replacements = {':': ',', '\uff1a': ',', '"': ',', '?': ''}
    for ch in '\\/*<>\r\n\t':
        replacements[ch] = '_'
    name = name.translate(str.maketrans(replacements)).strip()
    if not name:
        name = 'image'

    # Take the extension (jpeg, png, ...) from the URL path only, so query
    # strings and dotted host names cannot leak into the file name.
    houzui = os.path.splitext(urlparse(dict['url']).path)[1].lstrip('.')
    if not houzui.isalnum():
        # No usable suffix in the URL; 'jpg' is an arbitrary default.
        houzui = 'jpg'

    # The target directory is not guaranteed to exist on a fresh checkout.
    os.makedirs('./images', exist_ok=True)
    with open("./images/{}.{}".format(name, houzui), 'wb') as f:
        f.write(data)
def main():
    """Crawl result pages 1 through 19 and save every image found on them."""
    for page_num in range(1, 20):
        page = Get_page(page_num)
        if not page:
            # Get_page returns None when the request fails or the status is
            # not 200; skip that page instead of aborting the whole crawl.
            continue
        for record in get_images(page):
            save_image(record)


# Only start crawling when executed as a script, not when imported.
if __name__ == '__main__':
    main()