diff --git a/ExcelToMysql.py b/ExcelToMysql.py
index 7599b7e..804ea63 100644
--- a/ExcelToMysql.py
+++ b/ExcelToMysql.py
@@ -1,12 +1,10 @@
 import pandas as pd
 from sqlalchemy import create_engine, text
 import logging
-import math
-import re
 import time
 
 # --- Configuration (same as before) ---
-EXCEL_FILE_PATH = 'Z:\\xiaohu\\web_data40000-50000.xlsx'
+EXCEL_FILE_PATH = 'X:\\xiaohu\\web_data40000-50000.xlsx'
 SHEET_NAME = 0
 DB_USER = 'zsjie'
 DB_PASSWORD = 'xRekX6Cc3RRK6mBe'
diff --git a/zsjieSpider.py b/zsjieSpider.py
index 34fcf46..a355ce8 100644
--- a/zsjieSpider.py
+++ b/zsjieSpider.py
@@ -1,13 +1,13 @@
 from dbm import error
+from http.client import responses
 import requests
 from bs4 import BeautifulSoup
 import openpyxl
-import re
+import ssl
 
-
-range_start = 40000
-range_end = 50000
+range_start = 57831
+range_end = 70000
 
 # Create the Excel workbook and worksheet objects
 workbook = openpyxl.Workbook()
 sheet = workbook.active
@@ -17,39 +17,71 @@ rownum = 1
 sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password'])  # header row
 
 # Define the starting URL template and id range (assumed to be five-digit numbers)
+ssl._create_default_https_context = ssl._create_unverified_context
 url_template = 'https://www.zsjie.com/¥num.html'
 num_range = range(range_start, range_end)  # range of five-digit number combinations, 10000 to 99999
-headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
+headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36", 'Connection': 'close'}
+errorCount = 0
+num = 0
+response = None
+exceptionId = []
+lastSucessId = 0
+sucessCount = 0
+failCount = 0
 
 # Loop over every candidate URL and fetch its data
 for num in num_range:
     url = url_template.replace('¥num', str(num))  # build the URL
-    print(url)
-    try:
-        response = requests.get(url, headers=headers)  # request the page
-        if response.status_code == 200:  # check whether the request succeeded
-            soup = BeautifulSoup(response.content, 'html.parser')  # parse the HTML
-            # print(soup)
-            # Locate the target elements with CSS selectors; adjust the selectors to the actual page structure
-            el = soup.select('.entry-title')
-            if len(el) < 1:
-                continue
-            title = soup.select('.entry-title')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
-            tags = soup.select('.meta-category > a')[0].text
-            update_date = soup.select('.meta-date > time')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
-            rownum += 1
-            sheet._current_row = rownum
-            sheet.append([num, url, title, tags, update_date])
-        else:
-            print(url, response.status_code)
-    except Exception as e:
-        print(url)
-        print(e)
-        continue
-
-workbook.save('web_data' + str(range_start) + '-' + str(range_end) + '.xlsx')  # Change this to your own output path and filename; a file with the same name will be overwritten, so take care not to clobber an existing file.
-
+    # print(url)
+    tryCount = 0
+    if num % 100 == 0:
+        print("Through [" + str(num) + "]: recorded [" + str(sucessCount) + "], failed [" + str(failCount) + "]")
+    while True:
+        try:
+            response = requests.get(url, headers=headers, timeout=30)  # request the page
+            if response.status_code == 200:  # the request succeeded
+                errorCount = 0
+                soup = BeautifulSoup(response.content, 'html.parser')  # parse the HTML
+                # print(soup)
+                # Locate the target elements with CSS selectors; adjust the selectors to the actual page structure
+                el = soup.select('.entry-title')
+                if len(el) < 1:
+                    break  # 200 but no title element: skip this id (`continue` here would re-request the same URL forever)
+                title = soup.select('.entry-title')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
+                tags = soup.select('.meta-category > a')[0].text
+                update_date = soup.select('.meta-date > time')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
+                rownum += 1
+                sheet._current_row = rownum  # relies on a private openpyxl attribute
+                sheet.append([num, url, title, tags, update_date])
+                tryCount = 0
+                lastSucessId = num
+                sucessCount = sucessCount + 1
+                break
+            elif response.status_code == 404:
+                errorCount = errorCount + 1
+                failCount = failCount + 1
+                print(url, response.status_code, errorCount)
+                tryCount = 0
+                break
+            else:
+                tryCount = tryCount + 1
+                print(url, "attempt " + str(tryCount))
+        except Exception as e:
+            print(url, e)
+            tryCount = tryCount + 1
+            print(url, "attempt " + str(tryCount))
+        if tryCount > 10:
+            exceptionId.append(num)
+            failCount = failCount + 1
+            break
+    if errorCount >= 100:
+        # 100 consecutive 404s: assume the id range has run past the last article
+        break
+print(exceptionId)
+workbook.save('web_data' + str(range_start) + '-' + str(lastSucessId) + '-' + str(num) + '.xlsx')  # Change this to your own output path and filename; a file with the same name will be overwritten, so take care not to clobber an existing file.
+# Finally, make sure the script runs with sufficient permissions and that exceptions are handled correctly.
+# Check that the page structure is stable, since markup changes can break the script, and respect the
+# site's crawling policy to avoid unnecessary trouble.
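
The hand-rolled `while True` / `tryCount` loop in the patch can also be expressed with requests' built-in retry machinery. A minimal sketch, assuming urllib3 >= 1.26; the `fetch` helper, the session setup, and the specific retry parameters are illustrative, not part of the patch:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Same idea as the patch's headers dict (shortened here for the sketch)
headers = {"User-Agent": "Mozilla/5.0", "Connection": "close"}

session = requests.Session()
retry = Retry(
    total=10,                               # cap attempts, like `tryCount > 10` above
    backoff_factor=1,                       # exponential backoff between attempts
    status_forcelist=[500, 502, 503, 504],  # retry only transient server errors
    allowed_methods=["GET"],                # urllib3 >= 1.26; older versions used method_whitelist
)
session.mount('https://', HTTPAdapter(max_retries=retry))

def fetch(url):
    # 404 means "no such article id" for this site, so it is deliberately not retried
    return session.get(url, headers=headers, timeout=30)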
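
One caveat on the added `ssl._create_default_https_context = ssl._create_unverified_context` line: it only affects the standard-library `urllib` stack. `requests` handles TLS through urllib3, so if certificate verification really must be disabled for these requests, that would be `session.verify = False` (or `verify=False` per call) instead.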