bs4模块

bs4模块中我们主要使用的是其中的BeautifulSoup类，用它来帮助我们解析页面

bs4模块的导入

from bs4 import BeautifulSoup

BeautifulSoup的使用

#获得一个bs对象
bs = BeautifulSoup('爬取到的页面信息','使用什么解析器去解析')
#eg：BeautifulSoup(response.text,'lxml') lxml是一种解析器

#bs方法
bs.标签名称 返回文档中第一次出现的与标签名称对应的标签
bs.find(标签名称) 返回第一次与标签名称对应的标签
bs.find_all(标签名称，属性名称) 返回与标签名称对应的所有标签列表
#eg:bs.find('div',class_='值') 或 bs.find('div',id='值')，其他属性同理(属性名='值')；class是Python关键字，所以要写成class_

select
bs.select('选择器')，返回的是一个列表
标签.get_text() 获得标签对应的文本
#eg:bs.select('.tang') 获得class="tang"的标签列表（.tang是CSS类选择器的写法）
#eg:bs.select('.tang')[0].get_text() 获得对应标签的文本
标签[属性] #可以获得对应标签的属性
#eg:bs.select('.tang')[0]['href'] 获得对应标签的href

案例：三国演义爬取

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Scrape "Romance of the Three Kingdoms" from shicimingju.com: read the
# chapter index, then fetch every chapter page and print its title and text.
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
# Browser-like User-Agent sent with every request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)
# Parse the raw bytes so BeautifulSoup detects the page encoding itself;
# response.text can be mis-decoded when the server omits the charset header.
bs = BeautifulSoup(response.content, 'lxml')

# Map chapter title -> chapter link for every anchor in the table of contents.
sg_dict = {a.text: a['href'] for a in bs.select('.book-mulu a')}

print(sg_dict)

# Create the output directory once, before the loop. The original created it
# inside the loop with the fetch in the `else:` branch, so on a first run the
# first chapter was silently skipped.
os.makedirs('sg', exist_ok=True)

# `href` (not `url`) is used as the loop variable so the index URL above is
# not overwritten while iterating.
for name, href in sg_dict.items():
    print(name)
    # urljoin resolves both site-relative and absolute hrefs against the
    # index URL.
    sure_url = urljoin(url, href)
    chapter_response = requests.get(sure_url, headers=headers, timeout=10)
    bs2 = BeautifulSoup(chapter_response.content, 'lxml')
    print(bs2.title)
    # Print the element matched by the selector; the original printed the
    # selector string '.bookmark-list' itself.
    matches = bs2.select('.bookmark-list')
    if matches:
        print(matches[0].get_text())
    else:
        print(f'no .bookmark-list element found in {sure_url}')
    # TODO: persist each chapter to sg/<name>.txt. When doing so, write
    # bs2.title.get_text() rather than bs2.title: file.write() needs a str
    # and bs2.title is a Tag.