图片爬虫(代理ip)
warning:
这篇文章距离上次修改已过762天,其中的内容可能已经有所变动。
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 19 22:44:37 2022
@author: fch
"""
import random
from pyquery import PyQuery as pq
import requests
import os
import time
import json
# --- Module-level scraper configuration ---

# Pool of free HTTP proxies; one is chosen at random for the whole run.
# BUG FIX: requests looks up proxies by the *lowercase* URL scheme, so the
# key must be 'http' — the original 'HTTP' key never matched and every
# request silently went out without a proxy.
proxies = [
    {'http': 'http://122.140.5.115:9999'},
    {'http': 'http://113.101.96.66:8080'},
    {'http': 'http://113.124.86.24:9999'},
    {'http': 'http://121.13.252.58:41564'},
    {'http': 'http://61.216.185.88:60808'},
    {'http': 'http://58.20.184.187:9091'},
    {'http': 'http://183.236.123.242:8060'},
    {'http': 'http://116.9.163.205:58080'},
    {'http': 'http://222.74.73.202:42055'},
    {'http': 'http://183.247.202.208:30001'},
    {'http': 'http://39.108.101.55:1080'},
    {'http': 'http://47.105.91.226:8118'},
]
# Single proxy picked once per run and reused by every request below.
proxie = random.choice(proxies)

# User-Agent pool for listing/album pages.
user_agent_list = ['Chrome/86.0.4240.198',
                   'Chrome/101.0.4951.64',
                   'Chrome/96.0.4664.45',
                   'Chrome/94.0.4606.41'
                   ]
headers = {'User-Agent': random.choice(user_agent_list)}

# Separate User-Agent pool and headers for the image downloads themselves;
# the Referer is required by the site's hotlink protection.
list1 = [
    "Chrome/68.0.3440.106",
    "Chrome/67.0.3396.99",
    "Chrome/64.0.3282.186",
    "Chrome/62.0.3202.62",
    "Chrome/45.0.2454.101"
]
header = {'User-Agent': random.choice(list1),
          'Referer': 'http://81.68.202.74/',
          'sec-ch-ua': 'Google Chrome',
          'sec-ch-ua-platform': 'Windows'
          }

# Listing pages live at <base_url><page number>.
base_url = 'http://81.68.202.74/datu/page/'
# Note: passing verify=False to requests.get would skip SSL certificate verification
def Base_page(Page_num):
    """Fetch one listing page and return its decoded HTML, or None on failure.

    Parameters
    ----------
    Page_num : int
        1-based page index appended to ``base_url``.

    Returns
    -------
    str or None
        Page HTML decoded with the apparent encoding; None when the
        request fails or the server answers with an error status.
    """
    url = base_url + str(Page_num)
    try:
        # timeout prevents a dead proxy from hanging the whole crawl.
        r = requests.get(url, headers=headers, proxies=proxie, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Narrow except (the original bare `except:` also swallowed
        # KeyboardInterrupt); say which URL failed and why.
        print('ERROR fetching', url, '->', e)
        return None
def Page_url(fir_html):
    """Parse one listing page and crawl every album linked from it.

    Parameters
    ----------
    fir_html : str or None
        HTML of a listing page; None (a failed fetch) is skipped safely
        instead of crashing inside pyquery as the original did.
    """
    if not fir_html:
        # Base_page returns None on fetch failure — nothing to parse.
        return
    doc = pq(fir_html)
    items = doc('.posts-wrapper .entry-media').items()
    for num, item in enumerate(items, start=1):
        page_url = item('a').attr('href')
        # Strip ':' so the album title is usable as a directory name.
        fil_name = item('.lazyload').attr('alt').replace(":", "")
        print('------ 个数:', num, '------')
        Parse_img(page_url, fil_name)
def Parse_img(page_url, fil_name):
    """Fetch an album page and save every image found in its body.

    Parameters
    ----------
    page_url : str
        URL of the album detail page.
    fil_name : str
        Sanitized album title, used as the target sub-directory name.
    """
    time.sleep(0.25)  # polite delay between album fetches
    r = requests.get(page_url, headers=headers, proxies=proxie)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    doc = pq(r.text)
    items = doc('.entry-content p img').items()
    for i, item in enumerate(items, start=1):
        # Renamed from `dict`, which shadowed the builtin of the same name.
        img_info = {
            'url': item.attr('src'),
            'name': item.attr('title'),
        }
        print("保存第", i, '张图')
        time.sleep(1.25)  # throttle individual image downloads
        Save_img(img_info, fil_name)
def Save_img(dic, fil_name):
    """Download one image and write it under ``./images_2/<fil_name>/``.

    Parameters
    ----------
    dic : dict
        ``{'url': image source URL, 'name': image title used as file stem}``.
    fil_name : str
        Album sub-directory name under ``./images_2/``.
    """
    url = dic['url']
    data = requests.get(url, headers=header, proxies=proxie).content
    name = dic['name']
    # File extension is taken from the URL's last dot-separated segment.
    houzui = url.split('.')[-1]
    addr = './images_2/' + fil_name
    # exist_ok replaces the racy isdir-then-makedirs check of the original.
    os.makedirs(addr, exist_ok=True)
    with open('./images_2/{}/{}.{}'.format(fil_name, name, houzui), 'wb') as f:
        f.write(data)
def main():
    """Crawl listing pages 1 through 21 and download every album on each."""
    # range is half-open: pages 1..21 inclusive.
    for Page_num in range(1, 22):
        print("*************第", Page_num, '页*************')
        fir_html = Base_page(Page_num)
        Page_url(fir_html)
    print("结束!")


# Guard so importing this module does not trigger a full crawl.
if __name__ == '__main__':
    main()