优化日志输出

This commit is contained in:
huzhujiang 2025-05-13 09:53:38 +08:00
parent 5c5897d3ab
commit 639371daee

View File

@@ -1,19 +1,20 @@
from dbm import error from dbm import error
from http.client import responses from http.client import responses
import datetime
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import openpyxl import openpyxl
import ssl import ssl
range_start = 57831 range_start = 5000
range_end = 70000 range_end = 10000
# 创建Excel工作簿和工作表对象 # 创建Excel工作簿和工作表对象
workbook = openpyxl.Workbook() workbook = openpyxl.Workbook()
sheet = workbook.active sheet = workbook.active
sheet.title = 'Web Data' sheet.title = 'Web Data'
sheet._current_row = 1 sheet._current_row = 0
rownum = 1 rownum = 0
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行 sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行
# 定义起始URL模板和范围假设是五位数字 # 定义起始URL模板和范围假设是五位数字
@@ -36,7 +37,7 @@ for num in num_range:
# print(url) # print(url)
tryCount = 0 tryCount = 0
if num % 100 == 0: if num % 100 == 0:
print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]") print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
while True: while True:
try: try:
response = requests.get(url, headers=headers, timeout=30) # 发起网络请求获取网页内容 response = requests.get(url, headers=headers, timeout=30) # 发起网络请求获取网页内容
@@ -62,7 +63,8 @@ for num in num_range:
elif response.status_code == 404: elif response.status_code == 404:
errorCount = errorCount + 1 errorCount = errorCount + 1
failCount = failCount+1 failCount = failCount+1
print(url, response.status_code, errorCount) if errorCount % 10 == 0:
print(url, response.status_code, errorCount)
tryCount = 0 tryCount = 0
break break
else: else:
@@ -77,7 +79,7 @@ for num in num_range:
failCount = failCount+1 failCount = failCount+1
break break
continue continue
if errorCount >= 100: if errorCount >= 1000:
# 连续10次404 # 连续10次404
break break