Using the pyquery library
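
Before the full script, a minimal sketch of the pyquery calls it relies on (the HTML snippet is made up for illustration): pq() parses a document, CSS selectors pick out nodes, .text() joins the text of all matched nodes with single spaces (which is why the script below splits on ' '), and .items() iterates over the matches so .attr() can read each link:

from pyquery import PyQuery as pq

# Made-up snippet, just to demonstrate the selector API used below
demo = pq('<ul><li class="t"><a href="/a">MovieA</a></li>'
          '<li class="t"><a href="/b">MovieB</a></li></ul>')
print(demo('.t a').text())             # 'MovieA MovieB' -- texts joined by a space
print(demo('.t a').text().split(' '))  # ['MovieA', 'MovieB']
for a in demo('.t a').items():         # iterate over matched nodes one at a time
    print(a.attr('href'))              # '/a' then '/b'

Note that splitting on a space would also split any title that itself contains a space; it works on this page because the scraped titles are Chinese and rarely contain spaces.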

import csv
from pyquery import PyQuery as pq
import requests
import json

def Get_HTML(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()                  # raise on 4xx/5xx responses
        r.encoding = r.apparent_encoding      # guess the real encoding from the body
        return r.text
    except requests.RequestException as e:   # don't swallow every exception
        print('Request failed:', e)
        return None

def parse_page(html):
    doc = pq(html)
    title = doc('.li03 a').text().split(' ')     # movie titles
    actor = doc('.li04 span').text().split(' ')  # actor names
    index = doc('.li01 b').text().split(' ')     # rank numbers
    href = []                                    # detail-page links
    for i in doc('.li03 a').items():             # iterate over the matched <a> nodes
        href.append(i.attr('href'))
    score = doc('.li05 span').text().split(' ')  # scores
# Save as a CSV file (open with newline='' so csv does not insert blank rows on Windows)
    # with open('1905电影排行榜.csv', 'a+', newline='') as csvfile:
    #     writer = csv.writer(csvfile)
    #     writer.writerow(['序号', '名称', '链接', '演员', '评分'])
    #     for i in range(len(title)):
    #         writer.writerow([index[i], title[i], href[i], actor[i], score[i]])
# Save into a dict keyed by rank
    Result = {}
    for i in range(len(index)):  # use the actual number of parsed rows
        result = {
            '序号': index[i],    # rank
            '名称': title[i],    # title
            '链接': href[i],     # link
            '演员': actor[i],    # actors
            '评分': score[i]     # score
        }
        Result[index[i]] = result
    return Result
def write_to_txtfile(item):
    with open('1905电影排行榜.txt', 'a+', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

def main():
    url = 'https://www.1905.com/vod/top/lst/'
    html = Get_HTML(url)
    if html is None:  # request failed
        return
    item = parse_page(html)
# Write the dict to a TXT file, one JSON object per line
    # for i in range(1, len(item) + 1):
    #     write_to_txtfile(item[str(i)])
# Write the dict to a CSV file
    # with open('1905电影排行榜_111.csv', 'a+', encoding='utf-8', newline='') as csvfile:
    #     fieldnames = ['序号', '名称', '链接', '演员', '评分']
    #     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    #     writer.writeheader()
    #     for i in range(len(item)):
    #         writer.writerow(item[str(i + 1)])

main()
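
Both CSV branches stay commented out above; for reference, here is a standalone sketch of the csv.DictWriter pattern they use, on made-up rows (the filename demo.csv is just for illustration). Opening with newline='' stops the csv module from emitting blank rows on Windows, and utf-8-sig helps Excel detect the encoding of the Chinese headers:

import csv

# Hypothetical rows in the same shape parse_page() builds
rows = [
    {'序号': '1', '名称': 'Movie A', '链接': '/a', '演员': 'Actor A', '评分': '9.1'},
    {'序号': '2', '名称': 'Movie B', '链接': '/b', '演员': 'Actor B', '评分': '8.7'},
]
with open('demo.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['序号', '名称', '链接', '演员', '评分'])
    writer.writeheader()    # one header row, then one row per dict
    writer.writerows(rows)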

Practice with re and pyquery (scraping the 1905 movie ranking)

Regular expression version
import requests
import re
import json

# Get the HTML document
def Get_HTML(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()                  # raise on 4xx/5xx responses
        r.encoding = r.apparent_encoding      # guess the real encoding from the body
        return r.text
    except requests.RequestException as e:
        print('Request failed:', e)
        return None

# Define the regex rules and extract the fields
def parse_pages(html):
    # Non-greedy '.*?' plus re.S (dot matches newlines) skips across the markup
    # between the captured fields
    pattern = re.compile(
        '<dl.*?cl">.*?nob.*?>(.*?)</b>.*?class="li03 oh".*?href="(.*?)".*?'
        'class=" pl28">(.*?)</a>.*?<span>.*?title=.*?>(.*?)</a>.*?'
        'title=.*?>(.*?)</a>.*?title=.*?>(.*?)</a>.*?'
        'class="li05 ta_c".*?<span>(.*?)</span>.*?</dl>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'href': item[1],     # detail-page link captured from href="..."
            'title': item[2],
            'actor': item[3:6],  # three actor names per movie
            'score': item[6]
        }

def main():
    url = 'https://www.1905.com/vod/top/lst/'
    html = Get_HTML(url)
    if html is None:  # request failed
        return
    for item in parse_pages(html):
        print(item)
        # Write_to_file(item)

main()
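
A toy example (made-up HTML) of why the pattern needs both re.S and the non-greedy .*?: re.S lets '.' cross newlines, and '.*?' stops at the first possible match instead of swallowing the rest of the page:

import re

html = '<dl><b>1</b>\n<a href="/movie/1">Movie A</a>\n<span>9.1</span></dl>'
pattern = re.compile('<b>(.*?)</b>.*?href="(.*?)".*?>(.*?)</a>.*?<span>(.*?)</span>', re.S)
print(re.findall(pattern, html))  # [('1', '/movie/1', 'Movie A', '9.1')]
# Without re.S the '.' cannot cross the newlines and findall() returns []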

pyquery version
from pyquery import PyQuery as pq
import requests
import re
import json

def Get_HTML(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()                  # raise on 4xx/5xx responses
        r.encoding = r.apparent_encoding      # guess the real encoding from the body
        return r.text
    except requests.RequestException as e:   # don't swallow every exception
        print('Request failed:', e)
        return None

def parse_page(html):
    doc = pq(html)
    title = doc('.li03 a').text().split(' ')     # movie titles
    actor = doc('.li04 span').text().split(' ')  # actor names
    index = doc('.li01 b').text().split(' ')     # rank numbers
    href = []                                    # detail-page links
    for i in doc('.li03 a').items():             # iterate over the matched <a> nodes
        href.append(i.attr('href'))
    score = doc('.li05 span').text().split(' ')  # scores
    Result = {}
    for i in range(len(index)):  # use the actual number of parsed rows
        result = {
            '序号': index[i],    # rank
            '名称': title[i],    # title
            '链接': href[i],     # link
            '演员': actor[i],    # actors
            '评分': score[i]     # score
        }
        Result[index[i]] = result
    return Result
def write_to_file(item):
    with open('1905电影排行榜.txt', 'a+', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

def main():
    url = 'https://www.1905.com/vod/top/lst/'
    html = Get_HTML(url)
    if html is None:  # request failed
        return
    item = parse_page(html)
    for row in item.values():  # one JSON line per movie, in page order
        write_to_file(row)

main()
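
Since write_to_file() appends one JSON object per line, the file can be read back line by line with json.loads (a small sketch, assuming the file written by the script above):

import json

with open('1905电影排行榜.txt', encoding='UTF-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]
for movie in movies[:3]:
    print(movie['序号'], movie['名称'], movie['评分'])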