The xpath module

Usage

Installing the module
pip install lxml

Importing the module
from lxml import etree
1. Create an etree object

# 1. Parse a local HTML file and get an etree object
tree = etree.parse(filepath)

# 2. Parse HTML content fetched over the network and get an etree object
tree = etree.HTML('HTML content fetched from the network')
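
For a concrete, runnable version of the second form, here is a minimal sketch (the HTML string is made up for illustration). Note that etree.HTML returns the root element while etree.parse returns an ElementTree; both expose the xpath method:

from lxml import etree

# Build a tree from an HTML string already in memory (etree.HTML returns the root element)
tree = etree.HTML('<html><body><div class="box1">hello</div></body></html>')
print(tree.xpath('//div/text()'))  # ['hello']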

2. Use the xpath method to extract the content we need

# Select the tags you need with an XPath expression
tree.xpath('tag')

# Filter tags by attribute. A single / steps down one level in the hierarchy; a leading / starts
# from the root, while a leading // matches the tag anywhere in the document.
tree.xpath('//tag[@attribute="value"]')
e.g. tree.xpath('//div[@class="box1"]') returns a list of all div elements whose class is "box1"
Hierarchical lookups on tags:
tree.xpath('//tag[@attribute="value"]/p[3]') looks at the p tags inside the matching elements and returns the third one (XPath indexing starts at 1)

# Return text: append text() after the tag whose text you want; note that the result is a list
xpath('//p/text()')

# Get an attribute: /@attribute-name
e.g. xpath('//div/img/@src') gets the src attribute of img tags under div
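
Putting the expressions above together, here is a small self-contained sketch; the HTML snippet is invented purely for illustration:

from lxml import etree

html = '''
<html><body>
  <div class="box1">
    <p>first</p><p>second</p><p>third</p>
    <img src="/images/a.jpg" alt="demo"/>
  </div>
</body></html>
'''
tree = etree.HTML(html)

# All div elements whose class attribute is exactly "box1"
print(tree.xpath('//div[@class="box1"]'))

# The third p child of those divs (XPath indexing starts at 1)
print(tree.xpath('//div[@class="box1"]/p[3]/text()'))  # ['third']

# Text of every p element in the document; the result is always a list
print(tree.xpath('//p/text()'))  # ['first', 'second', 'third']

# The src attribute of img tags under any div
print(tree.xpath('//div/img/@src'))  # ['/images/a.jpg']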

Error when parsing local HTML

lxml.etree.XMLSyntaxError: Opening and ending tag mismatch: link line 45 and head, line 89, column 8

# Create your own parser
parser = etree.HTMLParser(encoding="utf-8")

tree = etree.parse('practice.html', parser=parser)
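
The reason this helps: etree.parse uses a strict XML parser by default, and real-world HTML is rarely well-formed XML (unclosed <link> or <meta> tags are common), which is exactly what the tag-mismatch error above complains about. Passing an HTMLParser makes parsing tolerant of that kind of markup.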

Example 1: scraping second-hand housing listings from 58.com

import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
page_text = requests.get('https://sh.58.com/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d100000-0000-2756-e0ba-4be2528ff316&ClickID=2',headers=headers).text


print(page_text)
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
for li in li_list:
    text = li.xpath('./div[2]/h2/a/text()')
    print(text)
print('over')
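
Two things to note about the query inside the loop: the leading ./ in li.xpath('./div[2]/h2/a/text()') makes the expression relative to the current li element rather than the whole document, and the class name house-list-wrap is tied to the page layout at the time of writing, so the expression may need adjusting if 58.com changes its HTML.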

Garbled Chinese text when scraping data

Solution 1: set the encoding on the response before reading its text
response = requests.get('http://pic.netbian.com/4kdongman/', headers=headers)
response.encoding = 'utf-8'
Solution 2: encode the garbled string back to raw bytes, then decode it with the page's real encoding
title = li.xpath('./a/img/@alt')[0]
title = title.encode('iso-8859-1').decode('gbk')
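
A third option, if you do not know the page's real encoding up front, is to let requests guess it from the response body; a sketch using the same request as above (whether the guess is right depends on the page):

response = requests.get('http://pic.netbian.com/4kdongman/', headers=headers)
# apparent_encoding is requests' guess at the charset based on the response content
response.encoding = response.apparent_encoding
page_text = response.text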

Example 2: scraping images from a website

import os
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
response = requests.get('http://pic.netbian.com/4kdongman/',headers=headers)
# response.encoding = 'utf-8'
page_text = response.text
tree = etree.HTML(page_text)

li_list = tree.xpath('//div[@class="slist"]/ul/li')
print(li_list)
url = 'http://pic.netbian.com'
for li in li_list:
    img_src = li.xpath('./a/img/@src')[0]
    title = li.xpath('./a/img/@alt')[0]
    # Fix the garbled title (the page is GBK-encoded)
    title = title.encode('iso-8859-1').decode('gbk')

    if not os.path.isdir('动漫'):
        os.mkdir('动漫')

    img_src_sure = url + img_src

    img = requests.get(img_src_sure, headers=headers).content
    with open(f'动漫/{title}.jpg', 'wb') as f:
        f.write(img)
    print(f'{title}爬取完成')
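
Note that the @src values on this site are relative paths, which is why the site root http://pic.netbian.com is prepended before downloading, and the alt text is run through the same encode/decode fix described in Solution 2 above.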