优化代码

This commit is contained in:
huzhujiang 2025-05-12 15:20:55 +08:00
parent 0641dcecfd
commit 5c5897d3ab
2 changed files with 64 additions and 34 deletions

View File

@ -1,12 +1,10 @@
import pandas as pd import pandas as pd
from sqlalchemy import create_engine, text from sqlalchemy import create_engine, text
import logging import logging
import math
import re
import time import time
# --- 配置 (与之前相同) --- # --- 配置 (与之前相同) ---
EXCEL_FILE_PATH = 'Z:\\xiaohu\\web_data40000-50000.xlsx' EXCEL_FILE_PATH = 'X:\\xiaohu\\web_data40000-50000.xlsx'
SHEET_NAME = 0 SHEET_NAME = 0
DB_USER = 'zsjie' DB_USER = 'zsjie'
DB_PASSWORD = 'xRekX6Cc3RRK6mBe' DB_PASSWORD = 'xRekX6Cc3RRK6mBe'

View File

@ -1,13 +1,13 @@
from dbm import error from dbm import error
from http.client import responses
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import openpyxl import openpyxl
import re import ssl
range_start = 57831
range_start = 40000 range_end = 70000
range_end = 50000
# 创建Excel工作簿和工作表对象 # 创建Excel工作簿和工作表对象
workbook = openpyxl.Workbook() workbook = openpyxl.Workbook()
sheet = workbook.active sheet = workbook.active
@ -17,18 +17,31 @@ rownum = 1
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行 sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行
# 定义起始URL模板和范围假设是五位数字 # 定义起始URL模板和范围假设是五位数字
ssl._create_default_https_context = ssl._create_unverified_context
url_template = 'https://www.zsjie.com/¥num.html' url_template = 'https://www.zsjie.com/¥num.html'
num_range = range(range_start, range_end) # 遍历 range_start 到 range_end(不含 range_end)之间的编号 num_range = range(range_start, range_end) # 遍历 range_start 到 range_end(不含 range_end)之间的编号
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"} headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",'Connection': 'close'}
errorCount = 0
num = 0
response = None
exceptionId = []
lastSucessId = 0
sucessCount = 0
failCount = 0
# 循环遍历所有可能的URL并获取数据 # 循环遍历所有可能的URL并获取数据
for num in num_range: for num in num_range:
url = url_template.replace('¥num', str(num)) # 构建URL url = url_template.replace('¥num', str(num)) # 构建URL
print(url) # print(url)
tryCount = 0
if num % 100 == 0:
print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
while True:
try: try:
response = requests.get(url, headers=headers) # 发起网络请求获取网页内容 response = requests.get(url, headers=headers, timeout=30) # 发起网络请求获取网页内容
if response.status_code == 200: # 检查请求是否成功 if response.status_code == 200: # 检查请求是否成功
errorCount = 0
soup = BeautifulSoup(response.content, 'html.parser') # 解析HTML内容 soup = BeautifulSoup(response.content, 'html.parser') # 解析HTML内容
# print(soup) # print(soup)
# 使用CSS选择器定位目标元素以下CSS选择器需根据实际情况更改 # 使用CSS选择器定位目标元素以下CSS选择器需根据实际情况更改
@ -42,14 +55,33 @@ for num in num_range:
rownum += 1 rownum += 1
sheet._current_row = rownum sheet._current_row = rownum
sheet.append([num, url, title, tags, update_date]) sheet.append([num, url, title, tags, update_date])
tryCount = 0
lastSucessId = num
sucessCount = sucessCount+1
break
elif response.status_code == 404:
errorCount = errorCount + 1
failCount = failCount+1
print(url, response.status_code, errorCount)
tryCount = 0
break
else: else:
print(url, response.status_code) tryCount = tryCount + 1
print(url, "尝试次数:" + str(tryCount))
except Exception as e: except Exception as e:
print(url) print(url, e)
print(e) tryCount = tryCount + 1
print(url, "尝试次数:" + str(tryCount))
if tryCount > 10:
exceptionId.append(num)
failCount = failCount+1
break
continue continue
if errorCount >= 100:
# 连续100次404,认为后续编号已无内容,提前结束遍历
break
workbook.save('web_data' + str(range_start) + '-' + str(range_end) + '.xlsx') # 请注意更改此路径为你的输出路径及文件名需求。如果存在同名文件将被覆盖。需要小心操作以防止意外覆盖原有文件。最后请确保脚本运行在有足够权限的环境下,并且正确处理了可能出现的异常。注意检查网页结构是否稳定,因为网页结构的变化可能导致脚本失效。同时,请遵守网站的爬虫政策以避免不必要的麻烦。 print(exceptionId)
workbook.save('web_data' + str(range_start) + '-' + str(lastSucessId)+ '-' + str(num) + '.xlsx') # 请注意更改此路径为你的输出路径及文件名需求。如果存在同名文件将被覆盖。需要小心操作以防止意外覆盖原有文件。
# 最后请确保脚本运行在有足够权限的环境下,并且正确处理了可能出现的异常。注意检查网页结构是否稳定,因为网页结构的变化可能导致脚本失效。同时,请遵守网站的爬虫政策以避免不必要的麻烦。