From dc908089fc70ec4a457aaeca8d4fbb06bf71ae4e Mon Sep 17 00:00:00 2001
From: huzhujiang <huzhujiang@foxmail.com>
Date: Tue, 15 Apr 2025 15:01:40 +0800
Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E5=91=BD=E5=90=8D=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 zsjieSpider.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 zsjieSpider.py

diff --git a/zsjieSpider.py b/zsjieSpider.py
new file mode 100644
index 0000000..34fcf46
--- /dev/null
+++ b/zsjieSpider.py
@@ -0,0 +1,55 @@
+from dbm import error
+
+import requests
+from bs4 import BeautifulSoup
+import openpyxl
+import re
+
+
+# Article ids in the half-open interval [range_start, range_end) are probed.
+range_start = 40000
+range_end = 50000
+# Create the Excel workbook and the worksheet that collects the results.
+workbook = openpyxl.Workbook()
+sheet = workbook.active
+sheet.title = 'Web Data'
+# append() keeps track of the next free row on its own, so the header is
+# written to row 1 and every later append() lands directly below it.
+sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password'])  # header row
+
+# URL template ('¥num' is the placeholder for the numeric id) and the id range.
+url_template = 'https://www.zsjie.com/¥num.html'
+num_range = range(range_start, range_end)  # range_end itself is excluded
+
+headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
+# Characters removed from scraped text: newline, space and apostrophe.
+strip_table = str.maketrans('', '', "\n '")
+
+# Visit every candidate URL. The finally clause saves whatever was collected
+# even when the crawl is interrupted (e.g. Ctrl-C) part-way through.
+try:
+    for num in num_range:
+        url = url_template.replace('¥num', str(num))  # build the page URL
+        print(url)
+        try:
+            # A timeout keeps one stalled connection from hanging the whole crawl.
+            response = requests.get(url, headers=headers, timeout=10)
+            if response.status_code != 200:
+                print(url, response.status_code)
+                continue
+            soup = BeautifulSoup(response.content, 'html.parser')
+            titles = soup.select('.entry-title')  # selectors depend on the site's layout
+            if not titles:
+                continue  # no title element: not an article page, skip it
+            title = titles[0].text.translate(strip_table)
+            tags = soup.select('.meta-category > a')[0].text
+            update_date = soup.select('.meta-date > time')[0].text.translate(strip_table)
+            sheet.append([num, url, title, tags, update_date])
+        except Exception as e:  # best effort: report the failing page, keep crawling
+            print(url)
+            print(e)
+finally:
+    workbook.save('web_data' + str(range_start) + '-' + str(range_end) + '.xlsx')  # overwrites an existing file of the same name
+
+
+