diff --git a/zsjieSpider.py b/zsjieSpider.py index a355ce8..04509be 100644 --- a/zsjieSpider.py +++ b/zsjieSpider.py @@ -1,19 +1,20 @@ from dbm import error from http.client import responses +import datetime import requests from bs4 import BeautifulSoup import openpyxl import ssl -range_start = 57831 -range_end = 70000 +range_start = 5000 +range_end = 10000 # 创建Excel工作簿和工作表对象 workbook = openpyxl.Workbook() sheet = workbook.active sheet.title = 'Web Data' -sheet._current_row = 1 -rownum = 1 +sheet._current_row = 0 +rownum = 0 sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password']) # 标题行 # 定义起始URL模板和范围(假设是五位数字) @@ -36,7 +37,7 @@ for num in num_range: # print(url) tryCount = 0 if num % 100 == 0: - print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]") + print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), "截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]") while True: try: response = requests.get(url, headers=headers, timeout=30) # 发起网络请求获取网页内容 @@ -62,7 +63,8 @@ for num in num_range: elif response.status_code == 404: errorCount = errorCount + 1 failCount = failCount+1 - print(url, response.status_code, errorCount) + if errorCount % 10 == 0: + print(url, response.status_code, errorCount) tryCount = 0 break else: @@ -77,7 +79,7 @@ for num in num_range: failCount = failCount+1 break continue - if errorCount >= 100: + if errorCount >= 1000: # 连续10次404 break