【自制实用小工具】——1、Xpath解析器

由于 JS 脚本的影响,我们请求得到的数据常常与网页上显示的数据不一样。而 Chrome 插件 XPath Helper 不能调试本地网页,于是有了制作一个 XPath 解析器的想法。(粗略尝试了一下,没有问题,大家要是发现 bug 的话记得评论告诉我啊~)
工具:

  1. PyQt5 库
  2. Qt designer
  3. sys 库
  4. requests 库
  5. lxml 库

步骤:

(一)用Qt designer设计界面

界面

(二)将.ui文件转换为.py文件

有关(一)、(二)部分的教程可以参考:https://www.jb51.net/article/170810.htm

(三)链接按钮

将以下代码添加到 `setupUi` 方法体的末尾(即 `connectSlotsByName` 调用之后):
# Wire each button's clicked signal to the handler method of the same name.
self.button_Get_html.clicked.connect(self.Button_Get_Html)
self.button_Xpath_Parse.clicked.connect(self.Button_Xpath_Parse)

(四)按钮事件

以下分别是按钮 ==Get Html== 和按钮 ==Xpath Parse== 的点击事件代码(作为 `Ui_Asyu17_Xpath_Helper` 类的方法):
def Button_Get_Html(self):
    """Fetch the page named in the "Web Site" box and display its HTML.

    Reads the URL from ``self.text_Web_Site``, downloads it with requests
    and writes the decoded body to ``self.text_HTML_Code``.  If the request
    or the decoding fails, the exception text is shown there instead.
    An empty URL only produces a hint message; no request is sent.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36'
    }
    url = self.text_Web_Site.toPlainText().strip()
    if not url:
        self.text_HTML_Code.setPlainText('网址不能为空!')
        return

    # Prepend a scheme to every bare host name, not only to those that
    # happen to start with "w" (e.g. "example.com" as well as "www.x.com").
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    try:
        # The session is closed as soon as the response has been read.
        with requests.session() as session:
            # NOTE: verify=False skips TLS certificate validation.  The
            # timeout keeps an unresponsive server from freezing the GUI.
            response = session.get(url=url, headers=headers, verify=False, timeout=10)
        raw = response.content
        try:
            res = raw.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8 (e.g. a GBK page): fall back to the encoding
            # requests detects from the body rather than dropping bytes.
            res = raw.decode(response.apparent_encoding or 'utf-8', 'ignore')
        # Show the returned content in text_HTML_Code.
        self.text_HTML_Code.setPlainText(res)
    except Exception as e:
        # Top-level GUI boundary: report any failure in the output box.
        self.text_HTML_Code.setPlainText(str(e))

def Button_Xpath_Parse(self):
    """Evaluate the XPath expression against the HTML box and list the matches.

    Reads the expression from ``self.text_Xpath_Syntax`` and the markup from
    ``self.text_HTML_Code``.  Each match is appended to ``self.text_Result``
    under a numbered separator line.  Parse or XPath errors are shown in the
    result box instead of propagating out of the slot.
    """
    self.text_Result.document().clear()

    xpath_syntax = self.text_Xpath_Syntax.toPlainText()
    html_code = self.text_HTML_Code.toPlainText()
    if not html_code.strip():
        # Nothing to parse; lxml cannot build a tree from an empty document.
        self.text_Result.setPlainText('HTML 代码不能为空!')
        return

    try:
        # Parsing is inside the try so malformed input is reported, not raised.
        html = etree.HTML(html_code)
        results = html.xpath(xpath_syntax)
        # Expressions such as count(...) or string(...) yield a single
        # scalar rather than a list; wrap it so it is shown as one result.
        if not isinstance(results, list):
            results = [results]
        for num, result in enumerate(results):
            self.text_Result.append('-' * 60 + '这里是第 ' + str(num) + ' 个')
            if isinstance(result, str):
                # Attribute values and text() nodes arrive as strings.
                text = str(result)
            else:
                # Elements expose .text, which is None when the element has
                # no leading text; show an empty line in that case.
                text = getattr(result, 'text', result)
                text = '' if text is None else str(text)
            self.text_Result.append(text)
    except Exception as e:
        # Top-level GUI boundary: report any failure in the result box.
        self.text_Result.setPlainText(str(e))

(五)初始化界面

在文件末尾添加以下程序入口代码:
if __name__ == '__main__':
    # Every PyQt5 program needs one QApplication object; sys.argv hands it
    # the command-line arguments.
    app = QtWidgets.QApplication(sys.argv)
    # QWidget is the base class of the UI objects; created without a parent
    # it serves as the window the generated UI is built on.
    w = QtWidgets.QWidget()
    ui = Ui_Asyu17_Xpath_Helper()
    ui.setupUi(w)
    w.show()

    # exec_() runs the Qt event loop; sys.exit() turns its return value into
    # the process exit status.  The method name carries an underscore because
    # `exec` was a reserved word in Python 2.
    sys.exit(app.exec_())

结果展示:

测试无问题后,可使用 pyinstaller 将代码打包成可执行文件~
结果展示
完整代码:
from PyQt5 import QtCore, QtGui, QtWidgets
import sys
import requests
from lxml import etree

# Turn off urllib3's warnings -- presumably the InsecureRequestWarning that
# would otherwise be emitted because the page fetch below uses verify=False.
requests.packages.urllib3.disable_warnings()

class Ui_Asyu17_Xpath_Helper(object):
    """Widget layout and button handlers of the XPath helper window.

    ``setupUi`` builds all child widgets on a caller-supplied QWidget and
    connects the two push buttons to ``Button_Get_Html`` and
    ``Button_Xpath_Parse``.
    """

    def setupUi(self, Asyu17_Xpath_Helper):
        """Create the child widgets on *Asyu17_Xpath_Helper* and wire the buttons.

        Args:
            Asyu17_Xpath_Helper: the QWidget that becomes the main window.
        """
        Asyu17_Xpath_Helper.setObjectName("Asyu17_Xpath_Helper")
        Asyu17_Xpath_Helper.resize(969, 905)

        # "Xpath Parse" button (bottom right).
        self.button_Xpath_Parse = QtWidgets.QPushButton(Asyu17_Xpath_Helper)
        self.button_Xpath_Parse.setGeometry(QtCore.QRect(830, 860, 75, 31))
        self.button_Xpath_Parse.setObjectName("button_Xpath_Parse")

        # Captions above the two large text panes.
        self.label = QtWidgets.QLabel(Asyu17_Xpath_Helper)
        self.label.setGeometry(QtCore.QRect(10, 10, 71, 16))
        self.label.setFrameShape(QtWidgets.QFrame.StyledPanel)
        self.label.setScaledContents(False)
        self.label.setObjectName("label")
        self.label_2 = QtWidgets.QLabel(Asyu17_Xpath_Helper)
        self.label_2.setGeometry(QtCore.QRect(490, 10, 51, 16))
        self.label_2.setFrameShape(QtWidgets.QFrame.StyledPanel)
        self.label_2.setScaledContents(False)
        self.label_2.setObjectName("label_2")

        # XPath expression row: caption + editable input.
        self.label_3 = QtWidgets.QLabel(Asyu17_Xpath_Helper)
        self.label_3.setGeometry(QtCore.QRect(20, 860, 91, 31))
        self.label_3.setObjectName("label_3")
        self.text_Xpath_Syntax = QtWidgets.QTextBrowser(Asyu17_Xpath_Helper)
        self.text_Xpath_Syntax.setGeometry(QtCore.QRect(110, 860, 681, 31))
        font = QtGui.QFont()
        font.setFamily("Arial")
        font.setPointSize(13)
        self.text_Xpath_Syntax.setFont(font)
        # QTextBrowser is read-only by default; make it usable as an input.
        self.text_Xpath_Syntax.setReadOnly(False)
        self.text_Xpath_Syntax.setObjectName("text_Xpath_Syntax")

        # URL row: "Get Html" button, editable input and caption.
        self.button_Get_html = QtWidgets.QPushButton(Asyu17_Xpath_Helper)
        self.button_Get_html.setGeometry(QtCore.QRect(830, 820, 75, 31))
        self.button_Get_html.setObjectName("button_Get_html")
        self.text_Web_Site = QtWidgets.QTextBrowser(Asyu17_Xpath_Helper)
        self.text_Web_Site.setGeometry(QtCore.QRect(110, 820, 681, 31))
        font = QtGui.QFont()
        font.setFamily("Arial")
        font.setPointSize(13)
        self.text_Web_Site.setFont(font)
        self.text_Web_Site.setReadOnly(False)
        self.text_Web_Site.setObjectName("text_Web_Site")
        self.label_4 = QtWidgets.QLabel(Asyu17_Xpath_Helper)
        self.label_4.setGeometry(QtCore.QRect(20, 820, 91, 31))
        self.label_4.setObjectName("label_4")

        # Side-by-side panes: HTML source on the left, results on the right.
        self.layoutWidget = QtWidgets.QWidget(Asyu17_Xpath_Helper)
        self.layoutWidget.setGeometry(QtCore.QRect(10, 30, 951, 781))
        self.layoutWidget.setObjectName("layoutWidget")
        self.horizontalLayout = QtWidgets.QHBoxLayout(self.layoutWidget)
        self.horizontalLayout.setContentsMargins(0, 0, 0, 0)
        self.horizontalLayout.setObjectName("horizontalLayout")
        self.text_HTML_Code = QtWidgets.QTextBrowser(self.layoutWidget)
        self.text_HTML_Code.setEnabled(True)
        font = QtGui.QFont()
        font.setFamily("Arial")
        font.setPointSize(12)
        self.text_HTML_Code.setFont(font)
        self.text_HTML_Code.setMouseTracking(False)
        self.text_HTML_Code.setTabletTracking(False)
        self.text_HTML_Code.setReadOnly(False)
        self.text_HTML_Code.setObjectName("text_HTML_Code")
        self.horizontalLayout.addWidget(self.text_HTML_Code)
        self.text_Result = QtWidgets.QTextBrowser(self.layoutWidget)
        font = QtGui.QFont()
        font.setFamily("Arial")
        font.setPointSize(12)
        self.text_Result.setFont(font)
        self.text_Result.setReadOnly(False)
        # Named like every other widget (the original left this one unnamed).
        self.text_Result.setObjectName("text_Result")
        self.horizontalLayout.addWidget(self.text_Result)

        self.retranslateUi(Asyu17_Xpath_Helper)
        QtCore.QMetaObject.connectSlotsByName(Asyu17_Xpath_Helper)

        # Wire each button's clicked signal to its handler method.
        self.button_Get_html.clicked.connect(self.Button_Get_Html)
        self.button_Xpath_Parse.clicked.connect(self.Button_Xpath_Parse)

    def retranslateUi(self, Asyu17_Xpath_Helper):
        """Set the window title and the captions of all labels and buttons."""
        _translate = QtCore.QCoreApplication.translate
        Asyu17_Xpath_Helper.setWindowTitle(_translate("Asyu17_Xpath_Helper", "Asyu17 Xpath Helper"))
        self.button_Xpath_Parse.setText(_translate("Asyu17_Xpath_Helper", "Xpath Parse"))
        self.label.setText(_translate("Asyu17_Xpath_Helper", "HTML Code:"))
        self.label_2.setText(_translate("Asyu17_Xpath_Helper", "Result:"))
        self.label_3.setText(_translate("Asyu17_Xpath_Helper", "Xpath Syntax:"))
        self.button_Get_html.setText(_translate("Asyu17_Xpath_Helper", "Get Html"))
        self.label_4.setText(_translate("Asyu17_Xpath_Helper", "Web Site:"))

    def Button_Get_Html(self):
        """Fetch the page named in the "Web Site" box and display its HTML.

        Reads the URL from ``self.text_Web_Site``, downloads it with requests
        and writes the decoded body to ``self.text_HTML_Code``.  If the
        request or the decoding fails, the exception text is shown there
        instead.  An empty URL only produces a hint message.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36'
        }
        url = self.text_Web_Site.toPlainText().strip()
        if not url:
            self.text_HTML_Code.setPlainText('网址不能为空!')
            return

        # Prepend a scheme to every bare host name, not only to those that
        # happen to start with "w" (e.g. "example.com" as well as "www.x.com").
        if not url.lower().startswith(('http://', 'https://')):
            url = 'http://' + url

        try:
            # The session is closed as soon as the response has been read.
            with requests.session() as session:
                # NOTE: verify=False skips TLS certificate validation.  The
                # timeout keeps an unresponsive server from freezing the GUI.
                response = session.get(url=url, headers=headers, verify=False, timeout=10)
            raw = response.content
            try:
                res = raw.decode('utf-8')
            except UnicodeDecodeError:
                # Not valid UTF-8 (e.g. a GBK page): fall back to the encoding
                # requests detects from the body rather than dropping bytes.
                res = raw.decode(response.apparent_encoding or 'utf-8', 'ignore')
            # Show the returned content in text_HTML_Code.
            self.text_HTML_Code.setPlainText(res)
        except Exception as e:
            # Top-level GUI boundary: report any failure in the output box.
            self.text_HTML_Code.setPlainText(str(e))

    def Button_Xpath_Parse(self):
        """Evaluate the XPath expression against the HTML box and list the matches.

        Reads the expression from ``self.text_Xpath_Syntax`` and the markup
        from ``self.text_HTML_Code``.  Each match is appended to
        ``self.text_Result`` under a numbered separator line.  Parse or XPath
        errors are shown in the result box instead of propagating.
        """
        self.text_Result.document().clear()

        xpath_syntax = self.text_Xpath_Syntax.toPlainText()
        html_code = self.text_HTML_Code.toPlainText()
        if not html_code.strip():
            # Nothing to parse; lxml cannot build a tree from an empty document.
            self.text_Result.setPlainText('HTML 代码不能为空!')
            return

        try:
            # Parsing is inside the try so malformed input is reported, not raised.
            html = etree.HTML(html_code)
            results = html.xpath(xpath_syntax)
            # Expressions such as count(...) or string(...) yield a single
            # scalar rather than a list; wrap it so it is shown as one result.
            if not isinstance(results, list):
                results = [results]
            for num, result in enumerate(results):
                self.text_Result.append('-' * 60 + '这里是第 ' + str(num) + ' 个')
                if isinstance(result, str):
                    # Attribute values and text() nodes arrive as strings.
                    text = str(result)
                else:
                    # Elements expose .text, which is None when the element
                    # has no leading text; show an empty line in that case.
                    text = getattr(result, 'text', result)
                    text = '' if text is None else str(text)
                self.text_Result.append(text)
        except Exception as e:
            # Top-level GUI boundary: report any failure in the result box.
            self.text_Result.setPlainText(str(e))

if __name__ == '__main__':
    # Every PyQt5 program needs one QApplication object; sys.argv hands it
    # the command-line arguments.
    app = QtWidgets.QApplication(sys.argv)
    # QWidget is the base class of the UI objects; created without a parent
    # it serves as the window the generated UI is built on.
    w = QtWidgets.QWidget()
    ui = Ui_Asyu17_Xpath_Helper()
    ui.setupUi(w)
    w.show()

    # exec_() runs the Qt event loop; sys.exit() turns its return value into
    # the process exit status.  The method name carries an underscore because
    # `exec` was a reserved word in Python 2.
    sys.exit(app.exec_())

==微信公众号:==

小术快跑