The xpath module

Usage

Installing the module
pip install lxml

Importing the module
from lxml import etree
1. Create an etree object

# 1. Parse a local HTML file and get an etree object
tree = etree.parse(filepath)

# 2. Parse HTML content fetched over the network and get an etree object
tree = etree.HTML('HTML content fetched from the network')
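
For a concrete, runnable version of the second form, here is a minimal sketch (the HTML string is made up for illustration). Note that etree.HTML returns the root element while etree.parse returns an ElementTree; both expose the xpath method:

from lxml import etree

# Build a tree from an HTML string already in memory (etree.HTML returns the root element)
tree = etree.HTML('<html><body><div class="box1">hello</div></body></html>')
print(tree.xpath('//div/text()'))  # ['hello']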

2. Use the xpath method to extract the content we need

# Select the tags you need with an XPath expression
tree.xpath('tag')

# Filter tags by attribute. A single / steps down one level in the hierarchy; a leading / starts
# from the root, while a leading // matches the tag anywhere in the document.
tree.xpath('//tag[@attribute="value"]')
e.g. tree.xpath('//div[@class="box1"]') returns a list of all div elements whose class is "box1"
Hierarchical lookups on tags:
tree.xpath('//tag[@attribute="value"]/p[3]') looks at the p tags inside the matching elements and returns the third one (XPath indexing starts at 1)

# Return text: append text() after the tag whose text you want; note that the result is a list
xpath('//p/text()')

# Get an attribute: /@attribute-name
e.g. xpath('//div/img/@src') gets the src attribute of img tags under div
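
Putting the expressions above together, here is a small self-contained sketch; the HTML snippet is invented purely for illustration:

from lxml import etree

html = '''
<html><body>
  <div class="box1">
    <p>first</p><p>second</p><p>third</p>
    <img src="/images/a.jpg" alt="demo"/>
  </div>
</body></html>
'''
tree = etree.HTML(html)

# All div elements whose class attribute is exactly "box1"
print(tree.xpath('//div[@class="box1"]'))

# The third p child of those divs (XPath indexing starts at 1)
print(tree.xpath('//div[@class="box1"]/p[3]/text()'))  # ['third']

# Text of every p element in the document; the result is always a list
print(tree.xpath('//p/text()'))  # ['first', 'second', 'third']

# The src attribute of img tags under any div
print(tree.xpath('//div/img/@src'))  # ['/images/a.jpg']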

Error when parsing local HTML

lxml.etree.XMLSyntaxError: Opening and ending tag mismatch: link line 45 and head, line 89, column 8

# Create your own parser
parser = etree.HTMLParser(encoding="utf-8")

tree = etree.parse('practice.html', parser=parser)
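
The reason this helps: etree.parse uses a strict XML parser by default, and real-world HTML is rarely well-formed XML (unclosed <link> or <meta> tags are common), which is exactly what the tag-mismatch error above complains about. Passing an HTMLParser makes parsing tolerant of that kind of markup.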

Example 1: scraping second-hand housing listings from 58.com

import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
page_text = requests.get('https://sh.58.com/ershoufang/?utm_source=market&spm=u-2d2yxv86y3v43nkddh1.BDPCPZ_BT&PGTID=0d100000-0000-2756-e0ba-4be2528ff316&ClickID=2',headers=headers).text


print(page_text)
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
for li in li_list:
    text = li.xpath('./div[2]/h2/a/text()')
    print(text)
print('over')
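
Two things to note about the query inside the loop: the leading ./ in li.xpath('./div[2]/h2/a/text()') makes the expression relative to the current li element rather than the whole document, and the class name house-list-wrap is tied to the page layout at the time of writing, so the expression may need adjusting if 58.com changes its HTML.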

Garbled Chinese text when scraping data

Solution 1: set the encoding on the response before reading its text
response = requests.get('http://pic.netbian.com/4kdongman/', headers=headers)
response.encoding = 'utf-8'
Solution 2: encode the garbled string back to raw bytes, then decode it with the page's real encoding
title = li.xpath('./a/img/@alt')[0]
title = title.encode('iso-8859-1').decode('gbk')
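
A third option, if you do not know the page's real encoding up front, is to let requests guess it from the response body; a sketch using the same request as above (whether the guess is right depends on the page):

response = requests.get('http://pic.netbian.com/4kdongman/', headers=headers)
# apparent_encoding is requests' guess at the charset based on the response content
response.encoding = response.apparent_encoding
page_text = response.text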

Example 2: scraping images from a website

import os
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
response = requests.get('http://pic.netbian.com/4kdongman/',headers=headers)
# response.encoding = 'utf-8'
page_text = response.text
tree = etree.HTML(page_text)

li_list = tree.xpath('//div[@class="slist"]/ul/li')
print(li_list)
url = 'http://pic.netbian.com'
for li in li_list:
    img_src = li.xpath('./a/img/@src')[0]
    title = li.xpath('./a/img/@alt')[0]
    # Fix the garbled title (the page is GBK-encoded)
    title = title.encode('iso-8859-1').decode('gbk')

    if not os.path.isdir('动漫'):
        os.mkdir('动漫')

    img_src_sure = url + img_src

    img = requests.get(img_src_sure, headers=headers).content
    with open(f'动漫/{title}.jpg', 'wb') as f:
        f.write(img)
    print(f'{title}爬取完成')
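
Note that the @src values on this site are relative paths, which is why the site root http://pic.netbian.com is prepended before downloading, and the alt text is run through the same encode/decode fix described in Solution 2 above.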