优化代码
This commit is contained in:
parent
0641dcecfd
commit
5c5897d3ab
@ -1,12 +1,10 @@
|
||||
import pandas as pd
|
||||
from sqlalchemy import create_engine, text
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import time
|
||||
|
||||
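# --- Configuration (same as before) ---
# NOTE(review): DB_USER / DB_PASSWORD below are committed in plain text; load them from
# environment variables or a secrets store instead, and rotate the exposed password.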
|
||||
EXCEL_FILE_PATH = 'Z:\\xiaohu\\web_data40000-50000.xlsx'
|
||||
EXCEL_FILE_PATH = 'X:\\xiaohu\\web_data40000-50000.xlsx'
|
||||
SHEET_NAME = 0
|
||||
DB_USER = 'zsjie'
|
||||
DB_PASSWORD = 'xRekX6Cc3RRK6mBe'
|
||||
|
@ -1,13 +1,13 @@
|
||||
from dbm import error
|
||||
from http.client import responses
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import openpyxl
|
||||
import re
|
||||
import ssl
|
||||
|
||||
|
||||
# ID range to crawl. Two pairs are listed because this text is a diff rendering:
# presumably 40000-50000 is the pre-commit pair and 57831-70000 the post-commit pair.
# Read top to bottom, the later assignments are the ones that take effect.
range_start = 40000
|
||||
range_end = 50000
|
||||
range_start = 57831
|
||||
range_end = 70000
|
||||
# Create the Excel workbook and take its active worksheet.
|
||||
workbook = openpyxl.Workbook()
|
||||
sheet = workbook.active
|
||||
@ -17,39 +17,71 @@ rownum = 1
|
||||
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password'])  # header row
|
||||
|
||||
# Define the URL template and the range of page IDs to visit.
|
||||
# Swap the default HTTPS context factory for the unverified one.
# NOTE(review): this turns off certificate verification wherever that default is used — confirm it is intended.
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
# '¥num' is the placeholder that the crawl loop replaces with the page ID.
url_template = 'https://www.zsjie.com/¥num.html'
|
||||
num_range = range(range_start, range_end)  # IDs from range_start (inclusive) to range_end (exclusive)
|
||||
|
||||
# Request headers. Two assignments are listed because this text is a diff rendering;
# the second one (which adds 'Connection': 'close') is assigned last and takes effect.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
|
||||
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",'Connection': 'close'}
|
||||
|
||||
# 404 responses counted since the last 200 response (the loop resets it on every 200).
errorCount = 0
|
||||
# Current page ID; pre-initialised because it is also used in the output file name.
num = 0
|
||||
# Most recent HTTP response.
response = None
|
||||
# IDs that were abandoned after too many failed attempts.
exceptionId = []
|
||||
# ID of the last page that was written to the sheet.
lastSucessId = 0
|
||||
# Number of pages written to the sheet.
sucessCount = 0
|
||||
# Number of IDs that ended in a 404 or were abandoned after repeated failures.
failCount = 0
|
||||
# Visit every page ID in num_range, retrying transient failures, and append one
# spreadsheet row per article found. The workbook is saved once, after the loop.
MAX_FAILED_ATTEMPTS = 10  # give up on an ID once more than this many attempts have failed
MAX_404_STREAK = 100      # stop crawling after this many 404s with no 200 in between

for num in num_range:
    url = url_template.replace('¥num', str(num))  # build the page URL for this ID
    tryCount = 0  # failed attempts (unexpected status or exception) for this ID
    if num % 100 == 0:
        # Progress line every 100 IDs.
        print("截至["+str(num)+"],成功录入["+str(sucessCount)+"]失败的数量["+str(failCount)+"]")
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=30)  # fetch the page
            if response.status_code == 200:
                errorCount = 0  # a reachable page ends the current 404 streak
                soup = BeautifulSoup(response.content, 'html.parser')
                # The CSS selectors below depend on the site's current markup.
                el = soup.select('.entry-title')
                if len(el) < 1:
                    # The page loaded but carries no article title. Retrying is
                    # unlikely to help, so skip this ID. A bare `continue` here
                    # would re-request the same URL forever, because the retry
                    # counter is never advanced on this path.
                    failCount = failCount + 1
                    break
                title = el[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
                tags = soup.select('.meta-category > a')[0].text
                update_date = soup.select('.meta-date > time')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
                rownum += 1
                # NOTE(review): this sets a private openpyxl attribute before
                # appending; kept as-is to preserve the sheet layout. Confirm
                # whether plain sheet.append() would be sufficient.
                sheet._current_row = rownum
                sheet.append([num, url, title, tags, update_date])
                lastSucessId = num
                sucessCount = sucessCount + 1
                break
            elif response.status_code == 404:
                # The page does not exist: count it and move on without retrying.
                errorCount = errorCount + 1
                failCount = failCount + 1
                print(url, response.status_code, errorCount)
                break
            else:
                # Any other status is treated as transient and retried.
                tryCount = tryCount + 1
                print(url, "尝试次数:" + str(tryCount))
        except Exception as e:
            # Network errors and parsing errors both land here and are retried.
            print(url, e)
            tryCount = tryCount + 1
            print(url, "尝试次数:" + str(tryCount))
        if tryCount > MAX_FAILED_ATTEMPTS:
            # Too many failed attempts: record the ID for later inspection and move on.
            exceptionId.append(num)
            failCount = failCount + 1
            break
    if errorCount >= MAX_404_STREAK:
        # 100 404 responses without a single 200 in between: assume the end of
        # the published ID range has been reached and stop crawling.
        break

# IDs that were abandoned after repeated failures.
print(exceptionId)

# The output is written to the current working directory and an existing file
# with the same name is overwritten. The file name records the first ID, the
# last ID that was saved, and the ID at which the crawl stopped.
# Page markup changes can break the selectors above; respect the site's crawling policy.
workbook.save('web_data' + str(range_start) + '-' + str(lastSucessId)+ '-' + str(num) + '.xlsx')
|
||||
|
Loading…
Reference in New Issue
Block a user