selenium爬虫实例

warning: 这篇文章距离上次修改已过909天,其中的内容可能已经有所变动。
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
import pymongo

# Connect to MongoDB: documents are stored in the `taobao` collection of the `test` database.
client = pymongo.MongoClient("mongodb://*********:27017")
# Kept under its own name so the MongoDB database handle is not clobbered by the
# MySQL connection, which is bound to `db` below.
mongo_db = client.test
col = mongo_db.taobao

# Connect to MySQL: `db`, `cursor` and `table` are read by write_to_mysql.
db = pymysql.Connect(host='*********', user='root', password='*********', port=3306, db='xiaomishop')
cursor = db.cursor()
table = 'taobao'

# Initial search URL.
kw = 'iPad'    # search keyword
url = 'https://s.taobao.com/search?q=' + kw

driver = webdriver.Chrome()  # browser instance driven by Selenium
wait = WebDriverWait(driver, 50)  # explicit-wait helper with a 50-second timeout

def Get_page(page_num, max_retries=3):
    '''
    Open the search URL, type ``page_num`` into the page-jump box, submit it,
    and hand the resulting page to Get_product.

    If locating the input box or the submit button times out, the whole
    sequence is retried, up to ``max_retries`` additional times.

    :param page_num: page number to jump to
    :param max_retries: number of retries after the first failed attempt
    :return: None
    '''
    # WebDriverWait.until raises Selenium's TimeoutException, not the builtin
    # TimeoutError, so that is the exception the retry logic must catch.
    from selenium.common.exceptions import TimeoutException

    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        print('='*25,'正在爬取第',page_num,'页','='*25)
        try:
            driver.get(url)
            # Wait until the page-jump input box is present in the DOM.
            page_input = wait.until(EC.presence_of_element_located((By.CLASS_NAME,'J_Input')))
            # Wait until the confirm button can be clicked.
            submit = wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'J_Submit')))
            page_input.clear()
            page_input.send_keys(str(page_num))
            submit.click()
        except TimeoutException:
            if attempt == attempts:
                print('giving up on page', page_num, 'after', attempts, 'attempts')
                return
            continue
        # NOTE(review): the page source is read right after the click; there is
        # no wait for the requested page to finish loading — confirm this is enough.
        Get_product()
        return

def Get_product():
    '''
    Parse the page currently loaded in the browser and print one dict per
    product with its name, image URL, price, purchase count, shop name and
    shop location.

    :return: None
    '''
    doc = pq(driver.page_source)
    for item in doc('.J_MouserOnverReq').items():
        product = {
            'name': item.find('.ctx-box .row-2 a').text(),
            # The previous selector embedded one specific item's numeric id
            # (J_Itemlist_Pic_657878599145), so it could match at most that
            # single product. Assumes the picture is an <img> inside
            # .pic-box-inner — TODO confirm against the live markup.
            'pic_img': item.find('.pic-box-inner img').attr('src'),
            'price': item.find('.ctx-box .row-1 strong').text(),
            'nums': item.find('.ctx-box .row-1 .deal-cnt').text(),
            'shop': item.find('.ctx-box .row-3 .shop span').text(),
            'location': item.find('.ctx-box .row-3 .location').text()
        }
        # Persistence is switched off; uncomment to store each product.
        # write_to_mongodb(product)
        # write_to_mysql(product)
        print(product)

def write_to_mongodb(dic):
    '''
    Save a result dict to the MongoDB collection ``col``.

    A shallow copy is inserted rather than ``dic`` itself: insert_one adds an
    ``_id`` key to the document it receives, and mutating the caller's dict
    would leak that key into any later use of it (for example a subsequent
    write_to_mysql call would try to insert an ``_id`` column).

    :param dic: document to store; left unmodified
    :return: the ``_id`` of the inserted document
    '''
    result = col.insert_one(dict(dic))
    return result.inserted_id

def write_to_mysql(dic):
    '''
    Insert a result dict as one row into the MySQL table named by ``table``.

    The dict keys are used as column names and the values are passed as
    query parameters. The insert is committed when execute reports affected
    rows; on any error the transaction is rolled back and the error printed.

    :param dic: mapping of column name to value
    :return: None
    '''
    columns = ','.join(dic)
    placeholders = ','.join(['%s'] * len(dic))
    sql = 'insert into {table}({keys}) values ({values})'.format(
        table=table, keys=columns, values=placeholders)
    try:
        if cursor.execute(sql, tuple(dic.values())):
            print('successful')
            db.commit()
    except Exception as exc:
        # Not a bare except: KeyboardInterrupt/SystemExit must still propagate.
        # The exception is printed so a failed insert can be diagnosed.
        print('failed', exc)
        db.rollback()


if __name__ == '__main__':
    try:
        # Scrape result pages 1 through 100.
        for page_num in range(1, 101):
            Get_page(page_num)
    finally:
        # Release the browser and both database connections even if scraping aborts.
        driver.quit()
        cursor.close()
        db.close()
        client.close()

添加新评论