图片爬虫(小改动)

warning: 这篇文章距离上次修改已过545天,其中的内容可能已经有所变动。
'''
网站网址:http://www.meinuzi.com/
'''

import os
import random
import re
from urllib.parse import urljoin, urlsplit

import requests
from pyquery import PyQuery as pq

# Candidate HTTP proxies. The mapping key must be the lowercase URL scheme:
# requests selects a proxy by the scheme of the request URL, so an upper-case
# 'HTTP' key never matches and the proxy would be silently ignored.
proxies = [
    {'http': 'http://122.140.5.115:9999'},
    {'http': 'http://113.101.96.66:8080'},
    {'http': 'http://113.124.86.24:9999'},
    {'http': 'http://121.13.252.58:41564'},
    {'http': 'http://61.216.185.88:60808'},
    {'http': 'http://58.20.184.187:9091'},
    {'http': 'http://183.236.123.242:8060'},
    {'http': 'http://116.9.163.205:58080'},
    {'http': 'http://222.74.73.202:42055'},
    {'http': 'http://183.247.202.208:30001'},
    {'http': 'http://39.108.101.55:1080'},
    {'http': 'http://47.105.91.226:8118'},
]
# One proxy is chosen when the module loads and reused for every request.
proxie = random.choice(proxies)

# Browser version tokens; one is chosen at load time for the User-Agent header.
user_agent_list = [
    'Chrome/86.0.4240.198',
    'Chrome/101.0.4951.64',
    'Chrome/96.0.4664.45',
    'Chrome/94.0.4606.41',
]

# Headers sent with every request. 'Referer' is also used by Save_img as the
# base URL for image paths.
headers = {
    'User-Agent': random.choice(user_agent_list),
    'Referer': 'http://www.meinuzi.com',
}

def Req_url(url):
    """Request *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or the sentinel string 'over' when the
        request fails (connection error, timeout, invalid URL or an HTTP
        error status). Callers compare the result against 'over'.
    """
    try:
        # Same 15 s timeout as Save_img, so a stalled server cannot hang
        # the crawl indefinitely.
        r = requests.get(url, headers=headers, proxies=proxie, timeout=15)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures map to the sentinel; KeyboardInterrupt and
        # programming errors are no longer swallowed.
        return 'over'
    else:
        # Decode using the encoding detected from the body rather than the
        # one declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text

def Parse_page(html):
    """Parse a listing page and hand every entry to Get_img.

    For each ``.m-list-main li`` entry, the link to its image page and the
    image's ``alt`` text (used as the file name) are extracted, a progress
    line is printed, and Get_img is called.

    Args:
        html: HTML text of a listing page.
    """
    entries = pq(html)('.m-list-main li').items()
    for index, entry in enumerate(entries, start=1):
        link = entry('.u-img a').attr('href')
        title = entry('.u-img img').attr('alt')
        print("保存第", index, '张图')
        Get_img(link, title)

def Get_img(page_url, name):
    """Fetch an image page, locate its image URL and save the image.

    Args:
        page_url: Address of the page that shows a single image.
        name: Base file name to save the image under.

    Returns:
        None. The entry is skipped with a message when the page cannot be
        fetched or contains no matching ``<img>`` element.
    """
    html = Req_url(page_url)
    if html == 'over':
        # Req_url signals any request failure with this sentinel; parsing it
        # as HTML would yield no image and crash later in Save_img.
        print('页面请求失败,跳过:', page_url)
        return

    img_baseurl = pq(html)('.g-mn .m-list-content img').attr('src')
    if not img_baseurl:
        # attr() returns None when nothing matches the selector.
        print('未找到图片地址,跳过:', page_url)
        return

    Save_img(img_baseurl, name)

def Save_img(img_baseurl, name):
    """Download one image and write it into the ``./images_1`` folder.

    Args:
        img_baseurl: Image address taken from the page; may be relative to
            the site root or already absolute.
        name: Base file name (without extension) for the saved image.

    Returns:
        None. A failed download is reported and skipped so that one bad
        image does not abort the whole crawl.
    """
    # urljoin handles root-relative, relative and absolute src values; plain
    # concatenation only worked for paths starting with '/'.
    img_url = urljoin(headers['Referer'], img_baseurl)

    try:
        response = requests.get(img_url, headers=headers, proxies=proxie,
                                timeout=15)
        # Without this check an HTTP error page would be saved as an image.
        response.raise_for_status()
    except requests.RequestException:
        print('图片下载失败,跳过:', img_url)
        return

    # Take the extension from the URL path only, so a query string or
    # fragment does not end up in the file name. Falls back to 'jpg' when
    # the path has no extension (assumption: images are JPEG by default).
    houzui = os.path.splitext(urlsplit(img_url).path)[1].lstrip('.') or 'jpg'

    # Replace characters that are path separators or invalid in Windows
    # file names, so the page title cannot break the output path.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', str(name))

    # Create the output folder on first use instead of failing in open().
    os.makedirs('./images_1', exist_ok=True)
    with open('./images_1/{}.{}'.format(safe_name, houzui), 'wb') as f:
        f.write(response.content)


def main():
    """Crawl listing pages one after another until no more data is found.

    Starts at a fixed page number and processes each listing page with
    Parse_page. Stops when Req_url reports a failure ('over') or when the
    page's list element contains no text.

    Returns:
        The string '程序结束' once crawling stops.
    """
    num = 7  # listing page number to start from

    while True:
        page_url = 'http://www.meinuzi.com/index_' + str(num) + '.html'
        html = Req_url(page_url)

        # 'over' means the page could not be fetched (treated as "no more
        # pages"). A page can also exist but hold an empty list, in which
        # case the list's text is the empty string.
        has_entries = (
            html != 'over'
            and pq(html)('.m-list-main ul').text() != ''
        )
        if not has_entries:
            print('程序结束')
            return '程序结束'

        print('**********第', num, '页**********')
        Parse_page(html)
        num += 1


main()






仅有一条评论

  1. 表评论3799

添加新评论