优化日志输出
This commit is contained in:
parent
5c5897d3ab
commit
639371daee
@ -1,19 +1,20 @@
|
||||
from dbm import error
|
||||
from http.client import responses
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import openpyxl
|
||||
import ssl
|
||||
|
||||
range_start = 57831
|
||||
range_end = 70000
|
||||
range_start = 5000
|
||||
range_end = 10000
|
||||
# 创建Excel工作簿和工作表对象
|
||||
workbook = openpyxl.Workbook()
|
||||
sheet = workbook.active
|
||||
sheet.title = 'Web Data'
|
||||
sheet._current_row = 1
|
||||
rownum = 1
|
||||
sheet._current_row = 0
|
||||
rownum = 0
|
||||
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行
|
||||
|
||||
# 定义起始URL模板和范围(假设是五位数字)
|
||||
@ -36,7 +37,7 @@ for num in num_range:
|
||||
# print(url)
|
||||
tryCount = 0
|
||||
if num % 100 == 0:
|
||||
print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
|
||||
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
|
||||
while True:
|
||||
try:
|
||||
response = requests.get(url, headers=headers, timeout=30) # 发起网络请求获取网页内容
|
||||
@ -62,6 +63,7 @@ for num in num_range:
|
||||
elif response.status_code == 404:
|
||||
errorCount = errorCount + 1
|
||||
failCount = failCount+1
|
||||
if errorCount % 10 == 0:
|
||||
print(url, response.status_code, errorCount)
|
||||
tryCount = 0
|
||||
break
|
||||
@ -77,7 +79,7 @@ for num in num_range:
|
||||
failCount = failCount+1
|
||||
break
|
||||
continue
|
||||
if errorCount >= 100:
|
||||
if errorCount >= 1000:
|
||||
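# Stop scanning once errorCount (incremented on each 404 response) reaches the limit checked above; the earlier "10 consecutive 404s" note no longer matches that threshold.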
|
||||
break
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user