图片爬虫(小改动)
warning:
这篇文章距离上次修改已过761天,其中的内容可能已经有所变动。
'''
网站网址:http://www.meinuzi.com/
'''
import os
import random
from urllib.parse import urljoin, urlparse

import requests
from pyquery import PyQuery as pq
# Pool of HTTP proxies; one entry is chosen at random for the whole run.
# requests selects a proxy by the lower-case URL scheme, so the key has to be
# 'http' -- an upper-case 'HTTP' key never matches and the proxy is ignored.
proxies = [
    {'http': 'http://122.140.5.115:9999'},
    {'http': 'http://113.101.96.66:8080'},
    {'http': 'http://113.124.86.24:9999'},
    {'http': 'http://121.13.252.58:41564'},
    {'http': 'http://61.216.185.88:60808'},
    {'http': 'http://58.20.184.187:9091'},
    {'http': 'http://183.236.123.242:8060'},
    {'http': 'http://116.9.163.205:58080'},
    {'http': 'http://222.74.73.202:42055'},
    {'http': 'http://183.247.202.208:30001'},
    {'http': 'http://39.108.101.55:1080'},
    {'http': 'http://47.105.91.226:8118'},
]
proxie = random.choice(proxies)

# Browser tokens; one is chosen at random as the User-Agent for the run.
user_agent_list = [
    'Chrome/86.0.4240.198',
    'Chrome/101.0.4951.64',
    'Chrome/96.0.4664.45',
    'Chrome/94.0.4606.41',
]

# Headers sent with every request. Save_img also reads headers['Referer']
# as the base URL that image paths are resolved against.
headers = {
    'User-Agent': random.choice(user_agent_list),
    'Referer': 'http://www.meinuzi.com',
}
'''
请求url,返回HTML
'''
def Req_url(url):
    """Fetch ``url`` and return the response body as text.

    Returns the sentinel string 'over' when the request fails (connection
    error, timeout, invalid URL, or a non-success HTTP status). Callers
    compare against that sentinel to detect a missing page.
    """
    try:
        # A timeout keeps one dead proxy/host from hanging the whole crawl.
        r = requests.get(url, headers=headers, proxies=proxie, timeout=15)
        r.raise_for_status()
        # Decode with the charset detected from the body rather than the
        # one declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel; KeyboardInterrupt and
        # programming errors are no longer swallowed.
        return 'over'
'''
解析初始页面,找到图片页面的网址
'''
def Parse_page(html):
    """Download the image behind every entry of a listing page.

    Each ``.m-list-main li`` item supplies the detail-page link (the href of
    ``.u-img a``) and the image name (the alt text of ``.u-img img``); both
    are handed to ``Get_img``.

    Args:
        html: markup of one listing page.
    """
    doc = pq(html)
    for i, item in enumerate(doc('.m-list-main li').items(), start=1):
        page_url = item('.u-img a').attr('href')
        name = item('.u-img img').attr('alt')
        if not page_url:
            # An entry without a link has nothing to download; passing None
            # on would end in a TypeError when the image URL is built.
            continue
        print("保存第", i, '张图')
        Get_img(page_url, name)
'''
解析图片页面的网址,找到每张图片的url
'''
def Get_img(page_url, name):
    """Locate the image on a detail page and save it.

    Args:
        page_url: URL of the detail page.
        name: base file name passed through to ``Save_img``.

    The entry is skipped (with a message) when the page cannot be fetched
    or contains no matching ``<img>`` with a ``src``.
    """
    html = Req_url(page_url)
    if html == 'over' or not html:
        # 'over' is Req_url's failure sentinel, not markup to be parsed.
        print('跳过:无法获取页面', page_url)
        return
    img_baseurl = pq(html)('.g-mn .m-list-content img').attr('src')
    if not img_baseurl:
        print('跳过:页面中未找到图片', page_url)
        return
    Save_img(img_baseurl, name)
'''
根据得到的图片url,保存到文件夹
'''
def Save_img(img_baseurl, name):
    """Download one image and write it under ``./images_1/``.

    Args:
        img_baseurl: the image's ``src`` value; resolved against
            ``headers['Referer']``, so site-relative paths and absolute
            URLs both work.
        name: base name for the output file (path separators are replaced).

    A failed download is reported and skipped instead of aborting the crawl.
    """
    img_url = urljoin(headers['Referer'], img_baseurl)
    try:
        resp = requests.get(img_url, headers=headers, proxies=proxie,
                            timeout=15)
        # Without this check an HTML error page would be saved as an image.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print('下载失败:', img_url, exc)
        return

    # Take the extension from the URL path only, so a query string or the
    # dots in the host name cannot end up in the file name.
    # NOTE(review): '.jpg' is an assumed fallback for extension-less URLs.
    houzui = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'
    safe_name = str(name).replace('/', '_').replace('\\', '_')

    os.makedirs('./images_1', exist_ok=True)
    with open('./images_1/{}{}'.format(safe_name, houzui), 'wb') as f:
        f.write(resp.content)
def main(num=7):
    """Crawl listing pages ``index_<num>.html`` upward until none are left.

    Args:
        num: index of the first listing page to fetch (defaults to 7, the
            value previously hard-coded here).

    Returns:
        The string '程序结束' once crawling stops.
    """
    while True:
        page_url = 'http://www.meinuzi.com/index_{}.html'.format(num)
        html = Req_url(page_url)
        # Stop when the page could not be fetched ('over' sentinel, taken to
        # mean every page has been crawled) or when the page exists but its
        # list element has no text, i.e. it holds no image entries.
        if html == 'over' or not pq(html)('.m-list-main ul').text():
            print('程序结束')
            return '程序结束'
        print('**********第', num, '页**********')
        Parse_page(html)
        num += 1
# Start the crawl only when run as a script, so importing this module
# does not trigger network requests.
if __name__ == '__main__':
    main()
表评论3799
叼茂SEO.bfbikes.com
想想你的文章写的特别好https://www.237fa.com/
看的我热血沸腾啊https://www.237fa.com/
看的我热血沸腾啊https://www.237fa.com/
看的我热血沸腾啊https://www.ea55.com/
不错不错,我喜欢看 www.jiwenlaw.com