优化代码

2025-05-12 15:20:55 +08:00 · 2025-05-12 15:20:55 +08:00 · 5c5897d3ab
commit 5c5897d3ab
parent 0641dcecfd
2 changed files with 64 additions and 34 deletions
--- a/ExcelToMysql.py
+++ b/ExcelToMysql.py
@ -1,12 +1,10 @@
 import pandas as pd
 from sqlalchemy import create_engine, text
 import logging
-import math
-import re
 import time

 # --- 配置 (与之前相同) ---
-EXCEL_FILE_PATH = 'Z:\\xiaohu\\web_data40000-50000.xlsx'
+EXCEL_FILE_PATH = 'X:\\xiaohu\\web_data40000-50000.xlsx'
 SHEET_NAME = 0
 DB_USER = 'zsjie'
 DB_PASSWORD = 'xRekX6Cc3RRK6mBe'
--- a/zsjieSpider.py
+++ b/zsjieSpider.py
@ -1,13 +1,13 @@
 from dbm import error
+from http.client import responses

 import requests
 from bs4 import BeautifulSoup
 import openpyxl
-import re
+import ssl

-
-range_start = 40000
-range_end = 50000
+range_start = 57831
+range_end = 70000
 # 创建Excel工作簿和工作表对象
 workbook = openpyxl.Workbook()
 sheet = workbook.active
@ -17,39 +17,71 @@ rownum = 1
 sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行

 # 定义起始URL模板和范围（假设是五位数字）
+ssl._create_default_https_context = ssl._create_unverified_context
 url_template = 'https://www.zsjie.com/¥num.html'
 num_range = range(range_start, range_end)  # 从10000到99999的五个数字组合范围

-headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
+headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",'Connection': 'close'}

+errorCount = 0
+num = 0
+response = None
+exceptionId = []
+lastSucessId = 0
+sucessCount = 0
+failCount = 0
 # 循环遍历所有可能的URL并获取数据
 for num in num_range:
    url = url_template.replace('¥num', str(num))  # 构建URL
-    print(url)
-    try:
-        response = requests.get(url, headers=headers)  # 发起网络请求获取网页内容
-        if response.status_code == 200:  # 检查请求是否成功
-            soup = BeautifulSoup(response.content, 'html.parser')  # 解析HTML内容
-            # print(soup)
-            # 使用CSS选择器定位目标元素，以下CSS选择器需根据实际情况更改
-            el = soup.select('.entry-title')
-            if len(el) < 1:
-                continue
-            title = soup.select('.entry-title')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
-            tags = soup.select('.meta-category > a')[0].text
-            update_date = soup.select('.meta-date > time')[0].text.replace('\n', '').replace(' ', '').replace('\'',
-                                                                                                              '')
-            rownum += 1
-            sheet._current_row = rownum
-            sheet.append([num, url, title, tags, update_date])
-        else:
-            print(url, response.status_code)
-    except Exception as e:
-        print(url)
-        print(e)
-        continue
-
-workbook.save('web_data' + str(range_start) + '-' + str(range_end) + '.xlsx')  # 请注意更改此路径为你的输出路径及文件名需求。如果存在同名文件将被覆盖。需要小心操作以防止意外覆盖原有文件。 最后请确保脚本运行在有足够权限的环境下，并且正确处理了可能出现的异常。注意检查网页结构是否稳定，因为网页结构的变化可能导致脚本失效。同时，请遵守网站的爬虫政策以避免不必要的麻烦。
-
+    # Fetch the page for this id. Every exit from the `while` loop below is a
+    # final verdict for the id: row stored, page skipped, 404, or abandoned.
+    tryCount = 0  # failed attempts for this URL; abandoned once it exceeds 10
+    if num % 100 == 0:  # progress report every 100 ids
+        print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
+    while True:
+        try:
+            response = requests.get(url, headers=headers, timeout=30)
+            if response.status_code == 200:
+                errorCount = 0  # a live page ends the run of consecutive 404s
+                soup = BeautifulSoup(response.content, 'html.parser')
+                el = soup.select('.entry-title')
+                if len(el) < 1:
+                    # No article title on the page: skip this id. A `continue`
+                    # here would re-request the same URL forever.
+                    break
+                title = el[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
+                tags = soup.select('.meta-category > a')[0].text
+                update_date = soup.select('.meta-date > time')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
+                rownum += 1
+                sheet._current_row = rownum
+                sheet.append([num, url, title, tags, update_date])
+                lastSucessId = num
+                sucessCount = sucessCount+1
+                break
+            elif response.status_code == 404:
+                errorCount = errorCount + 1
+                failCount = failCount+1
+                print(url, response.status_code, errorCount)
+                break
+            else:
+                # Any other status is treated as transient and retried below.
+                print(url, response.status_code)
+        except Exception as e:
+            # Request errors and parse errors (e.g. a missing tag or date
+            # element raising IndexError) both land here and are retried.
+            print(url, e)
+        # Reached only after a failed attempt (exception or unexpected status).
+        tryCount = tryCount + 1
+        print(url, "尝试次数：" + str(tryCount))
+        if tryCount > 10:
+            exceptionId.append(num)
+            failCount = failCount+1
+            break
+    if errorCount >= 100:
+        # 100 consecutive 404s: stop crawling.
+        break

+print(exceptionId)

+workbook.save('web_data' + str(range_start) + '-' + str(lastSucessId)+ '-' + str(num) + '.xlsx')  # 请注意更改此路径为你的输出路径及文件名需求。如果存在同名文件将被覆盖。需要小心操作以防止意外覆盖原有文件。
+# 最后请确保脚本运行在有足够权限的环境下，并且正确处理了可能出现的异常。注意检查网页结构是否稳定，因为网页结构的变化可能导致脚本失效。同时，请遵守网站的爬虫政策以避免不必要的麻烦。