Regular expressions
import requests
import re

# Fetch the HTML document
def Get_HTML(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("Error!")

# Build the regex and extract the fields from the ranking page
def parse_pages(html):
    # Capture groups, in order: rank, detail-page link, title,
    # the three listed actors, and score
    pattern = re.compile(
        '<dl.*?cl">.*?nob.*?>(.*?)</b>.*?class="li03 oh".*?href="(.*?)"'
        '.*?class=" pl28">(.*?)</a>.*?<span>.*?title=.*?>(.*?)</a>'
        '.*?title=.*?>(.*?)</a>.*?title=.*?>(.*?)</a>'
        '.*?class="li05 ta_c".*?<span>(.*?)</span>.*?</dl>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],    # href of the movie's detail page
            'title': item[2],
            'actor': item[3:6],  # the three listed actors
            'score': item[6]
        }

def main():
    url = 'https://www.1905.com/vod/top/lst/'
    html = Get_HTML(url)
    for item in parse_pages(html):
        print(item)
        # Write_to_file(item)

main()
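The call commented out inside main() refers to a Write_to_file helper that this first script never defines. A minimal sketch is given below, assuming the same JSON-lines output as the write_to_file function used in the pyquery version; the filename is carried over from that script and is only an assumption here.

import json

# Hypothetical helper for the commented-out call above: append each scraped
# record to a text file as one JSON object per line.
def Write_to_file(item):
    with open('1905电影排行榜.txt', 'a+', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')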
pyquery
from pyquery import PyQuery as pq
import requests
import json

def Get_HTML(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("Error!")

def parse_page(html):
    doc = pq(html)
    title = doc('.li03 a').text().split(' ')     # movie titles
    actor = doc('.li04 span').text().split(' ')  # actor names
    index = doc('.li01 b').text().split(' ')     # rank numbers
    href = []                                    # detail-page links
    for i in doc('.li03 a').items():             # iterate over the matched <a> nodes
        href.append(i.attr('href'))
    score = doc('.li05 span').text().split(' ')  # scores
    Result = {}
    for i in range(len(index)):                  # one record per ranked movie
        result = {
            '序号': index[i],
            '名称': title[i],
            '链接': href[i],
            '演员': actor[i],
            '评分': score[i]
        }
        Result[index[i]] = result
    return Result

def write_to_file(item):
    with open('1905电影排行榜.txt', 'a+', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

def main():
    url = 'https://www.1905.com/vod/top/lst/'
    html = Get_HTML(url)
    item = parse_page(html)
    for i in range(1, len(item) + 1):
        write_to_file(item[str(i)])

main()
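Because write_to_file stores one JSON object per line, the saved ranking can be loaded back for later use. A minimal sketch, assuming the same filename as above:

import json

# Read the ranking back: each non-empty line of the file is one JSON record.
with open('1905电影排行榜.txt', encoding='UTF-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]
print(len(movies), 'records loaded')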