Saving Crawled Data to MySQL
import pymysql
from pyquery import PyQuery as pq
import requests
def Get_HTML(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        print("Error!", e)
        return None
def parse_page(html):
    doc = pq(html)
    title = doc('.li03 a').text().split(' ')    # titles
    actor = doc('.li04 span').text().split(' ') # cast
    index = doc('.li01 b').text().split(' ')    # rank numbers
    href = []                                   # detail-page links
    for i in doc('.li03 a').items():            # iterate anchors to collect hrefs
        href.append(i.attr('href'))
    score = doc('.li05 span').text().split(' ') # ratings
    # store each movie as a dict, keyed by its rank
    Result = {}
    for i in range(len(index)):                 # the chart lists 100 movies; iterate over however many were parsed
        result = {
            '序号': index[i],
            '名称': title[i],
            '链接': href[i],
            '演员': actor[i],
            '评分': score[i]
        }
        Result[index[i]] = result
    return Result
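The parsing trick above relies on pyquery's .text() joining the text of every matched node with single spaces, so .split(' ') recovers one list entry per movie; the links need an explicit loop because .attr('href') only reads the first match. A minimal illustration with hypothetical markup:

from pyquery import PyQuery as pq

demo = pq('<ul><li><b>1</b></li><li><b>2</b></li><li><b>3</b></li></ul>')
print(demo('b').text())             # '1 2 3'  (texts joined by spaces)
print(demo('b').text().split(' '))  # ['1', '2', '3']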
def write_to_mysql(data):
    db = pymysql.connect(host='*******', user='root', password='******', port=3306, db='*******')
    cursor = db.cursor()
    table = 'movie'  # table name
    keys = ','.join(data.keys())             # column names come straight from the dict keys
    values = ','.join(['%s'] * len(data))    # one placeholder per column
    sql = 'insert into {table}({keys}) values ({values})'.format(table=table, keys=keys, values=values)
    try:
        if cursor.execute(sql, tuple(data.values())):
            print("successful")
            db.commit()
    except Exception:
        print("failed")
        db.rollback()
    db.close()
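For one record shaped like parse_page's output, the two join calls expand to the statement below; pymysql then substitutes each %s with the corresponding value, so nothing needs quoting by hand. A sketch with a hypothetical record:

data = {'序号': '1', '名称': 'demo', '链接': 'https://example.com', '演员': 'demo', '评分': '9.0'}
keys = ','.join(data.keys())            # 序号,名称,链接,演员,评分
values = ','.join(['%s'] * len(data))   # %s,%s,%s,%s,%s
sql = 'insert into movie({keys}) values ({values})'.format(keys=keys, values=values)
print(sql)  # insert into movie(序号,名称,链接,演员,评分) values (%s,%s,%s,%s,%s)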
# If the primary key already exists, update the row; otherwise insert it
def update_data(data):
    db = pymysql.connect(host='*******', user='root', password='******', port=3306, db='*******')
    cursor = db.cursor()
    table = 'movie'  # table name
    keys = ','.join(data.keys())
    values = ','.join(['%s'] * len(data))
    sql = 'insert into {table}({keys}) values ({values}) ON DUPLICATE KEY UPDATE'.format(table=table, keys=keys, values=values)
    update = ','.join([' {key}=%s'.format(key=key) for key in data])  # one " col=%s" pair per key
    sql += update
    try:
        if cursor.execute(sql, tuple(data.values()) * 2):  # values are needed twice: insert part and update part
            print("successful")
            db.commit()
    except Exception:
        print("failed")
        db.rollback()
    db.close()
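The upsert builds on the same string: one " col=%s" pair per key is appended after ON DUPLICATE KEY UPDATE, which is why execute receives tuple(data.values()) * 2. Reusing the hypothetical record from above:

update = ','.join([' {key}=%s'.format(key=key) for key in data])
sql = 'insert into movie({keys}) values ({values}) ON DUPLICATE KEY UPDATE'.format(
    keys=','.join(data.keys()), values=','.join(['%s'] * len(data))) + update
print(sql)
# insert into movie(序号,名称,链接,演员,评分) values (%s,%s,%s,%s,%s)
#   ON DUPLICATE KEY UPDATE 序号=%s, 名称=%s, 链接=%s, 演员=%s, 评分=%s
# cursor.execute(sql, tuple(data.values()) * 2)  -> each value appears twice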
# Query the table
def search():
    db = pymysql.connect(host='*******', user='root', password='******', port=3306, db='*******')
    cursor = db.cursor()
    sql = 'select * from movie'
    try:
        cursor.execute(sql)
        print(cursor.rowcount)  # number of rows returned
        row = cursor.fetchone()
        while row:
            print('Row:', row)
            row = cursor.fetchone()
    except Exception:
        print("Error")
    db.close()
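search() prints raw tuples; if access by column name is preferred, pymysql also ships a DictCursor. A sketch, assuming the same masked connection settings:

import pymysql.cursors

db = pymysql.connect(host='*******', user='root', password='******', port=3306,
                     db='*******', cursorclass=pymysql.cursors.DictCursor)
cursor = db.cursor()
cursor.execute('select * from movie')
for row in cursor.fetchall():
    print(row)  # each row comes back as a dict keyed by column name
db.close()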
def main():
    url = 'https://www.1905.com/vod/top/lst/'
    html = Get_HTML(url)
    item = parse_page(html)
    for i in range(len(item)):
        write_to_mysql(item[str(i + 1)])  # ranks are string keys '1'..'100'

if __name__ == '__main__':
    main()
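None of this runs without a movie table, and the upsert path needs a primary key to collide on (序号 here). The post never shows the schema, so the following is only a plausible sketch; the column names come from the code, but the types are assumptions:

ddl = '''
create table if not exists movie (
    序号 varchar(16) primary key,
    名称 varchar(255),
    链接 varchar(255),
    演员 varchar(255),
    评分 varchar(16)
) default charset=utf8mb4
'''
# Run once before the first insert, e.g. cursor.execute(ddl)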