IP代理池

warning: 这篇文章距离上次修改已过507天,其中的内容可能已经有所变动。
import requests
from bs4 import BeautifulSoup
import time
# Exclusive upper bound for the listing page numbers handed to range(1, num):
# pages 1 .. num - 1 of the proxy site are crawled.
num = 51


# Accumulators filled by the crawl loop; entry i of each list comes from the
# same kind of table cell (address, port, protocol column).
list_ip = []
list_port = []
list_type = []
# Proxy mappings assembled from the three lists above and handed to check_ip.
list_headers_ip = []


def check_ip(list_ip, url='https://pic.99ym.cn/d/file/202009/mz1kqud4v2i.jpg', timeout=3):
    """Return the proxy mappings from ``list_ip`` that can fetch ``url``.

    Every candidate is tried once with a GET request. It is kept when the
    response status is 200 and dropped on any other status or on any
    exception (the exception is printed, not raised).

    Args:
        list_ip: Iterable of proxy mappings of the form
            ``{'http': 'http://119.14.253.128:8088'}``. Keys may be given in
            any letter case.
        url: Address requested through each candidate. Defaults to the image
            URL this script has always probed.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).

    Returns:
        list: The working mappings, as the same objects that were passed in,
        in input order.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36 Edg/91.0.864.71',
        'Connection': 'close',
        'Referer': 'https://m.7160.top/rentiyishu/'
    }

    can_use = []
    for ip in list_ip:
        # requests picks a proxy by looking up the target URL's lowercase
        # scheme in the mapping, so an entry keyed 'HTTP' / 'HTTPS' would be
        # ignored and the request would silently go out without the proxy.
        # NOTE(review): a mapping whose only key is 'http' is still not
        # consulted for an https ``url`` -- confirm whether a scheme-matched
        # probe URL is wanted for such entries.
        normalized = {scheme.lower(): address for scheme, address in ip.items()}
        try:
            # verify=False: certificate validation is skipped for the probe.
            response = requests.get(url=url, headers=headers, proxies=normalized,
                                    timeout=timeout, verify=False)
            if response.status_code == 200:
                can_use.append(ip)
        except Exception as e:
            # Deliberately broad: one broken candidate must not abort the
            # whole sweep; report it and move on to the next one.
            print(e)

    return can_use


# Crawl listing pages 1 .. num - 1 and collect each row's address, port and
# protocol into list_ip / list_port / list_type.
page_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36 Edg/91.0.864.71'}

for start in range(1, num):

    url = 'https://www.kuaidaili.com/free/inha/{}/'.format(start)
    print("正在处理url: ", url)

    try:
        # A timeout keeps one unresponsive page from hanging the whole crawl.
        response = requests.get(url=url, headers=page_headers, timeout=10)
    except requests.RequestException as e:
        # Skip this page but keep crawling the remaining ones.
        print(e)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')

        address_cells = soup.select('#list > table > tbody > tr > td:nth-child(1)')
        port_cells = soup.select('#list > table > tbody > tr > td:nth-child(2)')
        type_cells = soup.select('#list > table > tbody > tr > td:nth-child(4)')
        # Append the three columns together so the lists always grow by the
        # same amount and stay index-aligned for the code that pairs them up.
        for address_cell, port_cell, type_cell in zip(address_cells, port_cells, type_cells):
            list_ip.append(address_cell.get_text())
            list_port.append(port_cell.get_text())
            list_type.append(type_cell.get_text())
    time.sleep(0.5)  # throttle: crawling too fast returns incomplete data

# Proxy mapping format understood by requests:  'http': 'http://119.14.253.128:8088'
# The key must be the lowercase URL scheme; requests does not match
# 'HTTP' / 'HTTPS', so uppercase keys would leave the proxy unused.
for address, port_text, protocol in zip(list_ip, list_port, list_type):
    scheme = 'http' if protocol == 'HTTP' else 'https'
    list_headers_ip.append({scheme: scheme + '://' + address + ':' + port_text})

can_use = check_ip(list_headers_ip)
print('能用的代理IP为:', can_use)
print('能用的代理IP数量为:', len(can_use))

# One mapping per line, written as its dict repr. The with-block closes the
# file, so no explicit close() is needed afterwards.
with open('./IP代理池.txt', 'w', encoding='utf-8') as f:
    f.writelines(str(entry) + '\n' for entry in can_use)
最后修改于:2022年12月30日 12:53

添加新评论