pyquery库使用
warning:
这篇文章距离上次修改已过723天，其中的内容可能已经有所变动。
import csv
from pyquery import PyQuery as pq
import requests
import json
def Get_HTML(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded response text, or ``None`` when the request fails
        (connection error, timeout, or a non-2xx status code).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are swallowed here; KeyboardInterrupt
        # and programming errors are allowed to propagate.
        print("Error!")
        return None
    # Use the encoding detected from the body rather than the one guessed
    # from the HTTP headers before decoding the text.
    r.encoding = r.apparent_encoding
    return r.text
def parse_page(html, limit=100):
    """Extract the ranking rows from the page markup.

    Every field is read element by element instead of splitting the
    concatenated text on spaces, so a value that itself contains a space
    (for example a multi-word title) stays in one piece and remains
    aligned with its link.

    Args:
        html: Markup of the ranking page. ``None`` or an empty string
            produces an empty result.
        limit: Maximum number of rows to return; ``None`` returns every
            row found.

    Returns:
        A dict keyed by the rank text. Each value is a dict with the keys
        '序号' (rank), '名称' (title), '链接' (link), '演员' (cast) and
        '评分' (score).
    """
    if not html:
        return {}

    doc = pq(html)

    def texts(selector):
        # One string per matched element.
        return [node.text() for node in doc(selector).items()]

    # NOTE(review): assumes each selector matches exactly one element per
    # ranked entry -- confirm against the live page markup.
    index = texts('.li01 b')     # rank
    actor = texts('.li04 span')  # cast
    score = texts('.li05 span')  # score

    # Title and link come from the same anchor, so collect them together.
    title = []
    href = []
    for anchor in doc('.li03 a').items():
        title.append(anchor.text())
        href.append(anchor.attr('href'))

    # zip() stops at the shortest column, which avoids the IndexError a
    # fixed-size loop raises when the page lists fewer rows than expected.
    rows = list(zip(index, title, href, actor, score))[:limit]

    Result = {}
    for rank, name, link, cast, rating in rows:
        Result[rank] = {
            '序号': rank,
            '名称': name,
            '链接': link,
            '演员': cast,
            '评分': rating,
        }
    return Result
def write_to_txtfile(item):
    """Append ``item`` to '1905电影排行榜.txt' as a single JSON line.

    Args:
        item: Any JSON-serialisable object (typically one record dict).

    Raises:
        TypeError: If ``item`` cannot be serialised to JSON.
    """
    # Serialise before opening the file so that an unserialisable item
    # does not create or touch the output file.
    # ensure_ascii=False keeps non-ASCII text readable in the output.
    line = json.dumps(item, ensure_ascii=False) + '\n'
    with open('1905电影排行榜.txt', 'a+', encoding='UTF-8') as f:
        f.write(line)
def main():
    """Fetch the 1905.com ranking page and parse it into a dict of records.

    The parsed records are not persisted by default; the commented
    snippets below show two ways to save them.
    """
    url = 'https://www.1905.com/vod/top/lst/'
    html = Get_HTML(url)
    if html is None:
        # The download failed and was already reported; nothing to parse.
        return
    item = parse_page(html)
    # Write the records to a TXT file, one JSON object per line:
    # for i in range(1,len(item)+1):
    #     write_to_txtfile(item[str(i)])
    # Write the records to a CSV file:
    # with open('1905电影排行榜_111.csv','a+',encoding='utf-8') as csvfile:
    #     fieldnames = ['序号', '名称', '链接', '演员', '评分']
    #     writer=csv.DictWriter(csvfile,fieldnames=fieldnames)
    #     writer.writeheader()
    #     for i in range(len(item)):
    #         writer.writerow(item[str(i+1)])
# Run the scraper only when executed as a script, so that importing this
# module does not trigger a network request.
if __name__ == "__main__":
    main()