优化日志输出
This commit is contained in:
parent
5c5897d3ab
commit
639371daee
@ -1,19 +1,20 @@
|
|||||||
from dbm import error
|
from dbm import error
|
||||||
from http.client import responses
|
from http.client import responses
|
||||||
|
import datetime
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import openpyxl
|
import openpyxl
|
||||||
import ssl
|
import ssl
|
||||||
|
|
||||||
range_start = 57831
|
range_start = 5000
|
||||||
range_end = 70000
|
range_end = 10000
|
||||||
# 创建Excel工作簿和工作表对象
|
# 创建Excel工作簿和工作表对象
|
||||||
workbook = openpyxl.Workbook()
|
workbook = openpyxl.Workbook()
|
||||||
sheet = workbook.active
|
sheet = workbook.active
|
||||||
sheet.title = 'Web Data'
|
sheet.title = 'Web Data'
|
||||||
sheet._current_row = 1
|
sheet._current_row = 0
|
||||||
rownum = 1
|
rownum = 0
|
||||||
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行
|
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行
|
||||||
|
|
||||||
# 定义起始URL模板和范围(假设是五位数字)
|
# 定义起始URL模板和范围(假设是五位数字)
|
||||||
@ -36,7 +37,7 @@ for num in num_range:
|
|||||||
# print(url)
|
# print(url)
|
||||||
tryCount = 0
|
tryCount = 0
|
||||||
if num % 100 == 0:
|
if num % 100 == 0:
|
||||||
print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
|
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, headers=headers, timeout=30) # 发起网络请求获取网页内容
|
response = requests.get(url, headers=headers, timeout=30) # 发起网络请求获取网页内容
|
||||||
@ -62,7 +63,8 @@ for num in num_range:
|
|||||||
elif response.status_code == 404:
|
elif response.status_code == 404:
|
||||||
errorCount = errorCount + 1
|
errorCount = errorCount + 1
|
||||||
failCount = failCount+1
|
failCount = failCount+1
|
||||||
print(url, response.status_code, errorCount)
|
if errorCount % 10 == 0:
|
||||||
|
print(url, response.status_code, errorCount)
|
||||||
tryCount = 0
|
tryCount = 0
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@ -77,7 +79,7 @@ for num in num_range:
|
|||||||
failCount = failCount+1
|
failCount = failCount+1
|
||||||
break
|
break
|
||||||
continue
|
continue
|
||||||
if errorCount >= 100:
|
if errorCount >= 1000:
|
||||||
# 404 计数 errorCount 达到上限后停止
|
# 404 计数 errorCount 达到上限后停止
|
||||||
break
|
break
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user