zsjie/zsjieSpider.py

import datetime
import ssl

import requests
from bs4 import BeautifulSoup
import openpyxl
range_start = 5000
range_end = 10000
# Create the Excel workbook and worksheet
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = 'Web Data'
# Header row; the 'source' and 'password' columns are declared here but left
# blank by this script.
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password'])
# Define the URL template and the article ID range to scan
ssl._create_default_https_context = ssl._create_unverified_context  # disable TLS certificate verification globally
url_template = 'https://www.zsjie.com/{num}.html'
num_range = range(range_start, range_end)  # IDs from range_start up to (but not including) range_end
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    'Connection': 'close',
}
errorCount = 0
num = 0
response = None
exceptionId = []
lastSuccessId = 0
successCount = 0
failCount = 0
# Loop over every candidate URL and fetch its data
for num in num_range:
    url = url_template.format(num=num)  # build the URL for this ID
    # print(url)
    tryCount = 0
    if num % 100 == 0:
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
              "up to [" + str(num) + "], saved [" + str(successCount) + "], failed [" + str(failCount) + "]")
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=30)  # fetch the page
            if response.status_code == 200:  # request succeeded
                errorCount = 0
                soup = BeautifulSoup(response.content, 'html.parser')  # parse the HTML
                # print(soup)
                # Locate the target elements; update these CSS selectors if
                # the site layout changes.
                el = soup.select('.entry-title')
                if len(el) < 1:
                    # 200 response but no title element: record a parse
                    # failure instead of re-requesting the same URL forever.
                    failCount = failCount + 1
                    break
                title = el[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
                tags = soup.select('.meta-category > a')[0].text
                update_date = soup.select('.meta-date > time')[0].text.replace('\n', '').replace(' ', '').replace('\'', '')
                sheet.append([num, url, title, tags, update_date])
                tryCount = 0
                lastSuccessId = num
                successCount = successCount + 1
                break
            elif response.status_code == 404:
                errorCount = errorCount + 1
                failCount = failCount + 1
                if errorCount % 10 == 0:
                    print(url, response.status_code, errorCount)
                tryCount = 0
                break
            else:
                # Any other status code: retry, up to the limit checked below
                tryCount = tryCount + 1
                print(url, "attempts: " + str(tryCount))
        except Exception as e:
            print(url, e)
            tryCount = tryCount + 1
            print(url, "attempts: " + str(tryCount))
        if tryCount > 10:
            exceptionId.append(num)
            failCount = failCount + 1
            break
    if errorCount >= 1000:
        # 1000 consecutive 404s: assume we are past the last article ID
        break
print(exceptionId)  # IDs that still failed after 10 retries each
# Save the workbook. Adjust the output path/filename as needed; a file with
# the same name will be overwritten, so take care not to clobber earlier runs.
workbook.save('web_data' + str(range_start) + '-' + str(lastSuccessId) + '-' + str(num) + '.xlsx')
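
# A minimal sketch (hypothetical helper, not called above): derive the output
# filename from the current timestamp so repeated runs never overwrite each
# other. The prefix and timestamp format are assumptions, not part of the
# original script.
def timestamped_filename(prefix='web_data'):
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    return prefix + '_' + stamp + '.xlsx'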
# Finally, make sure the script runs with sufficient permissions and handles
# possible exceptions. Check that the page structure is stable, since layout
# changes can break the selectors above, and follow the site's crawler policy
# to avoid unnecessary trouble.
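
# A minimal sketch (not wired into the loop above; the robots.txt location is
# the conventional one and an assumption here): check the site's robots.txt
# with the standard library before crawling, as the note above suggests.
import urllib.robotparser

def allowed_by_robots(url, user_agent='Mozilla/5.0'):
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url('https://www.zsjie.com/robots.txt')
    parser.read()  # fetch and parse robots.txt
    return parser.can_fetch(user_agent, url)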