import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
import pymongo
# Connect to MongoDB: results go to the "taobao" collection of the "test" database.
client = pymongo.MongoClient("mongodb://*********:27017")
col = client.test.taobao

# Connect to MySQL: `db` is the connection, `cursor` executes statements on it,
# and `table` names the table that rows are inserted into.
db = pymysql.Connect(
    host='*********',
    user='root',
    password='*********',
    port=3306,
    db='xiaomishop',
)
cursor = db.cursor()
table = 'taobao'

# Initial search URL, built from the search keyword.
kw = 'iPad'  # search keyword
url = f'https://s.taobao.com/search?q={kw}'

# Browser object shared by the crawler functions, plus an explicit-wait helper.
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 50)  # maximum wait, in seconds
def Get_page(page_num, max_retries=3):
    '''
    Load the search URL, jump to result page ``page_num`` and scrape it.

    The page number is typed into the element with class ``J_Input`` and
    submitted through the element with class ``J_Submit``; ``Get_product()``
    is then called to parse whatever page the browser shows.

    :param page_num: page number to type into the page-jump box
    :param max_retries: how many extra attempts to make after a wait times out
    :return: None
    :raises TimeoutException: when every attempt timed out
    '''
    # Imported here so the block does not rely on an extra file-level import.
    from selenium.common.exceptions import TimeoutException

    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        print('='*25,'正在爬取第',page_num,'页','='*25)
        try:
            driver.get(url)
            # Wait until the page-number box exists and the confirm button is clickable.
            page_box = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'J_Input')))
            submit = wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'J_Submit')))
            page_box.clear()
            page_box.send_keys(page_num)
            submit.click()
            # NOTE(review): Get_product() is called right after the click and
            # nothing here waits for the requested page to render -- confirm
            # whether an explicit wait on the result list is needed.
            Get_product()
            return
        except (TimeoutException, TimeoutError):
            # WebDriverWait.until() signals a timeout with selenium's
            # TimeoutException, not the builtin TimeoutError the original
            # caught; the builtin is still accepted for backward compatibility.
            # Retrying in a bounded loop replaces the unbounded recursion.
            if attempt == attempts:
                raise
def Get_product():
    '''
    Parse the page currently loaded in ``driver`` and print one dict per product.

    Every element matching ``.J_MouserOnverReq`` is treated as one product;
    its name, image URL, price, purchase count, shop name and shop location
    are collected into a dict and printed.

    :return: None
    '''
    doc = pq(driver.page_source)
    for item in doc('.J_MouserOnverReq').items():
        product = {
            'name': item.find('.ctx-box .row-2 a').text(),
            # The original selector ended in '.J_Itemlist_Pic_657878599145',
            # which embeds one product's numeric id and so could match at most
            # that single item.
            # NOTE(review): assumes the thumbnail is the <img> inside
            # '.pic-box-inner' -- confirm against the live markup.
            'pic_img': item.find('.pic-box-inner img').attr('src'),
            'price': item.find('.ctx-box .row-1 strong').text(),
            'nums': item.find('.ctx-box .row-1 .deal-cnt').text(),
            'shop': item.find('.ctx-box .row-3 .shop span').text(),
            'location': item.find('.ctx-box .row-3 .location').text(),
        }
        # Persistence is currently disabled; results are only printed.
        # write_to_mongodb(product)
        # write_to_mysql(product)
        print(product)
def write_to_mongodb(dic):
    '''
    Save one result dict to the MongoDB collection ``col``.

    :param dic: mapping of field name to value for one product
    :return: None
    '''
    # insert_one() adds an '_id' key to the document it is handed when none is
    # present. Inserting a shallow copy keeps the caller's dict unchanged, so
    # the same dict can still be passed to write_to_mysql(), which turns every
    # key into a column name.
    col.insert_one(dict(dic))
def write_to_mysql(dic):
    '''
    Insert one result dict as a row into the MySQL table named by ``table``.

    The dict keys become the column list and the values are bound as query
    parameters. The insert is committed when ``cursor.execute`` returns a
    truthy value; on any error the transaction is rolled back.

    :param dic: mapping of column name to value
    :return: None
    '''
    # Column names and the table name are interpolated into the statement
    # text, so they must come from trusted code; only the values are bound.
    keys = ','.join(dic.keys())
    values = ','.join(['%s'] * len(dic))
    sql = 'insert into {table}({keys}) values ({values})'.format(
        table=table, keys=keys, values=values)
    try:
        if cursor.execute(sql, tuple(dic.values())):
            print('successful')
            db.commit()
    except Exception as exc:
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit;
        # catch ordinary errors only and report why the insert failed.
        print('failed', exc)
        db.rollback()
if __name__ == '__main__':
    # Crawl result pages 1..100. The browser is shut down afterwards even if
    # a page raises, so no browser/driver process is left running.
    try:
        for page_num in range(1, 101):
            Get_page(page_num)
    finally:
        driver.quit()
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ActionChains
# Basic usage: run a Baidu search and print the URL and cookies of the session.
# Create the browser object.
browser = webdriver.Chrome()
try:
    # Visit the page, type "python" into the element with id 'kw' and press Enter.
    browser.get('https://www.baidu.com')
    search_box = browser.find_element(By.ID, 'kw')  # was `input`, which shadowed the builtin
    search_box.send_keys('python')
    search_box.send_keys(Keys.ENTER)
    # Wait up to 10 seconds for the element with id 'content_left' to appear.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print('=' * 50)
    print(browser.current_url)
    print('=' * 50)
    print(browser.get_cookies())
    print('=' * 50)
    # print(browser.page_source)
except Exception as exc:
    # Report the failure instead of swallowing it silently; the script goes on.
    print('error', exc)
finally:
    # quit() ends the whole driver session on both the success and error path.
    browser.quit()
# Node interaction: type into Taobao's search field and click the search button.
driver = webdriver.Chrome()
try:
    driver.get('https://www.taobao.com')
    search_box = driver.find_element(By.ID, 'q')  # was `input`, which shadowed the builtin
    search_box.send_keys('手机')
    # search_box.clear()  # clear the field
    button = driver.find_element(By.CLASS_NAME, 'btn-search')
    button.click()
except Exception as exc:
    # Report the failure instead of swallowing it silently; the script goes on.
    print('error', exc)
finally:
    # Single cleanup point; quit() ends the whole driver session.
    driver.quit()
# Action chains: drag the element with id 'draggable' onto the one with id 'droppable'.
driver = webdriver.Chrome()
try:
    driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.switch_to.frame('iframeResult')  # switch into the frame
    source = driver.find_element(By.ID, 'draggable')
    target = driver.find_element(By.ID, 'droppable')
    actions = ActionChains(driver)
    actions.drag_and_drop(source, target)
    actions.perform()
except Exception as exc:
    # Report the failure instead of swallowing it silently; the script goes on.
    print('error', exc)
finally:
    # The original only closed the browser when an error occurred, leaving it
    # open after a successful run; release it on both paths.
    driver.quit()
# Executing JavaScript, e.g. scrolling the page to the bottom.
driver = webdriver.Chrome()
try:
    driver.get('https://www.zhihu.com/explore')
    driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    driver.execute_script('alert("To Bottom")')
except Exception as exc:
    print('error', exc)
finally:
    # Release the browser on both the success and the error path; quit() ends
    # the whole driver session rather than closing only the current window.
    driver.quit()
# Reading attributes: print an element and the value of its 'class' attribute.
driver = webdriver.Chrome()
try:
    driver.get('https://www.zhihu.com/explore')
    logo = driver.find_element(By.ID, 'special')
    print(logo)
    print(logo.get_attribute('class'))
except Exception as exc:
    print('error', exc)
finally:
    # Single cleanup point: the original closed the driver in both branches,
    # so a failing close() in the try block triggered a second close().
    driver.quit()
# Reading text: print the text of the first element with the given class.
driver = webdriver.Chrome()
try:
    driver.get('https://www.zhihu.com/explore')
    res = driver.find_element(By.CLASS_NAME, 'ExploreSpecialCard-contentTitle')
    print(res.text)
except Exception as exc:
    print('error', exc)
finally:
    # Single cleanup point; quit() ends the whole driver session.
    driver.quit()
# Reading an element's id, location, tag name and size.
driver = webdriver.Chrome()
try:
    driver.get('https://www.zhihu.com/explore')
    res = driver.find_element(By.CLASS_NAME, 'ExploreSpecialCard-contentTitle')
    print('id', res.id)
    print('位置', res.location)
    print('标签', res.tag_name)
    print('大小', res.size)
except Exception as exc:
    print('error', exc)
finally:
    # Single cleanup point; quit() ends the whole driver session.
    driver.quit()
# Explicit waits: block until specific conditions hold instead of sleeping.
driver = webdriver.Chrome()
try:
    driver.get('https://www.taobao.com')
    # Create a WebDriverWait object with the maximum wait time (10 seconds).
    wait = WebDriverWait(driver, 10)
    # until() takes the condition to wait for: presence_of_element_located
    # waits for the node to appear, element_to_be_clickable for it to be clickable.
    search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))  # was `input`
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
    print(search_box, button)
except Exception as exc:
    print('error', exc)
finally:
    # Single cleanup point; quit() ends the whole driver session.
    driver.quit()
# Navigating back and forward through the browser history.
driver = webdriver.Chrome()
try:
    driver.get('https://www.taobao.com/')
    driver.get('https://www.baidu.com/')
    driver.get('https://www.python.org/')
    driver.back()  # go back one page
    time.sleep(1)
    driver.forward()  # go forward again
except Exception as exc:
    print('error', exc)
finally:
    # Single cleanup point; quit() ends the whole driver session.
    driver.quit()