From 639371daee2046c3f0589f669a3886a1536d1454 Mon Sep 17 00:00:00 2001
From: huzhujiang <huzhujiang@foxmail.com>
Date: Tue, 13 May 2025 09:53:38 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=97=A5=E5=BF=97=E8=BE=93?=
 =?UTF-8?q?=E5=87=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

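---
Review notes (placed after the "---" separator, so they are not part of the
commit message and do not affect the diff below):

The subject decodes to "Optimize log output", but this patch changes more
than logging:

  * Logging: the progress line printed every 100 ids is now prefixed with a
    "%Y-%m-%d %H:%M:%S" timestamp (adds "import datetime"), and the 404
    line is printed only when errorCount is a multiple of 10.
  * Run parameters: range_start/range_end change from 57831/70000 to
    5000/10000.
  * Worksheet state: the initial values of sheet._current_row and rownum
    change from 1 to 0.
  * Abort condition: the loop now stops at errorCount >= 1000 instead of
    >= 100. The unchanged comment beside it still reads "10 consecutive
    404s", which matches neither the old nor the new threshold.
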
 zsjieSpider.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/zsjieSpider.py b/zsjieSpider.py
index a355ce8..04509be 100644
--- a/zsjieSpider.py
+++ b/zsjieSpider.py
@@ -1,19 +1,20 @@
 from dbm import error
 from http.client import responses
+import datetime
 
 import requests
 from bs4 import BeautifulSoup
 import openpyxl
 import ssl
 
-range_start = 57831
-range_end = 70000
+range_start = 5000
+range_end = 10000
 # 创建Excel工作簿和工作表对象
 workbook = openpyxl.Workbook()
 sheet = workbook.active
 sheet.title = 'Web Data'
-sheet._current_row = 1
-rownum = 1
+sheet._current_row = 0
+rownum = 0
 sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行
 
 # 定义起始URL模板和范围(假设是五位数字)
@@ -36,7 +37,7 @@ for num in num_range:
     # print(url)
     tryCount = 0
     if num % 100 == 0:
-        print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
+        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
     while True:
         try:
             response = requests.get(url, headers=headers, timeout=30)  # 发起网络请求获取网页内容
@@ -62,7 +63,8 @@ for num in num_range:
             elif response.status_code == 404:
                 errorCount = errorCount + 1
                 failCount = failCount+1
-                print(url, response.status_code, errorCount)
+                if errorCount % 10 == 0:
+                    print(url, response.status_code, errorCount)
                 tryCount = 0
                 break
             else:
@@ -77,7 +79,7 @@ for num in num_range:
                 failCount = failCount+1
                 break
             continue
-    if errorCount >= 100:
+    if errorCount >= 1000:
         # 连续10次404
         break