优化日志输出

This commit is contained in:
huzhujiang 2025-05-13 09:53:38 +08:00
parent 5c5897d3ab
commit 639371daee

View File

@ -1,19 +1,20 @@
from dbm import error
from http.client import responses
import datetime
import requests
from bs4 import BeautifulSoup
import openpyxl
import ssl
# Numeric ID range to scan and spreadsheet setup.
# NOTE(review): this is a rendered diff with the +/- markers stripped, so
# several settings appear twice; the second occurrence of each pair appears to
# be the post-commit value -- confirm against the real file.
range_start = 57831
range_end = 70000
range_start = 5000
range_end = 10000
# Create the Excel workbook and worksheet objects
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = 'Web Data'
# NOTE(review): _current_row is a private openpyxl attribute; presumably it
# controls which row the next append() writes to -- confirm against openpyxl.
sheet._current_row = 1
rownum = 1
sheet._current_row = 0
rownum = 0
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # header row
# Define the starting URL template and range (assumed to be five-digit numbers)
@ -36,7 +37,7 @@ for num in num_range:
# print(url)
# Reset the per-URL attempt counter before entering the request loop.
tryCount = 0
# Print a progress line whenever num is a multiple of 100.
# NOTE(review): both the old print and the new timestamp-prefixed print are
# shown here because the diff markers were stripped; the second one appears to
# be the replacement introduced by this commit.
if num % 100 == 0:
print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
# Loop until one of the branches below breaks out.
while True:
try:
response = requests.get(url, headers=headers, timeout=30) # Send the network request to fetch the page content
@ -62,6 +63,7 @@ for num in num_range:
# HTTP 404: count it as both an error and a failure, then leave the request loop.
elif response.status_code == 404:
errorCount = errorCount + 1
failCount = failCount+1
# Log only when errorCount is a multiple of 10.
# NOTE(review): the hunk header indicates one added line; presumably it is
# this condition, and the print below is meant to be nested under it.
if errorCount % 10 == 0:
print(url, response.status_code, errorCount)
tryCount = 0
break
@ -77,7 +79,7 @@ for num in num_range:
# Tail of a failure branch: record the failure and leave the request loop.
failCount = failCount+1
break
continue
# Stop once errorCount reaches the threshold.
# NOTE(review): both the old (100) and new (1000) threshold lines are shown
# because the diff markers were stripped; the second appears to be the
# post-commit value.
if errorCount >= 100:
if errorCount >= 1000:
# 10 consecutive 404s
# NOTE(review): the comment above is stale -- the code compares against
# 100/1000, not 10, and no reset of errorCount is visible here, so
# "consecutive" is unconfirmed.
break