zsjie/zsjieSpider.py
2025-04-15 15:02:00 +08:00

56 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from dbm import error
import requests
from bs4 import BeautifulSoup
import openpyxl
import re
# Crawl script: walks a range of numeric article ids, downloads each article
# page, extracts its title, category and update date, and writes one row per
# article into an Excel workbook.

# First article id to fetch (inclusive) and the id to stop before (exclusive).
range_start = 40000
range_end = 50000

# Seconds to wait on a single request. Without a timeout one stalled
# connection would block the whole crawl indefinitely.
request_timeout = 10

# Characters removed from the scraped title and date: newline, space and
# single quote.
strip_table = str.maketrans('', '', "\n '")

# Create the Excel workbook and the worksheet that receives the results.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = 'Web Data'
# append() tracks the next free row on its own, so the header goes to row 1
# and each later append() lands directly below the previous one. Overriding
# the private ``_current_row`` attribute is not needed and previously left
# row 1 of the sheet empty.
sheet.append(['id', 'URL', 'Title', 'Tags', 'update_date', 'source', 'password'])  # header row

# URL template; the placeholder is replaced by the numeric article id.
url_template = 'https://www.zsjie.com/¥num.html'
num_range = range(range_start, range_end)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# A single session for the whole crawl so connections to the host are reused.
session = requests.Session()
session.headers.update(headers)

try:
    # Visit every candidate URL and collect the data.
    for num in num_range:
        url = url_template.replace('¥num', str(num))
        print(url)
        try:
            response = session.get(url, timeout=request_timeout)
            if response.status_code != 200:
                print(url, response.status_code)
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            # The CSS selectors below depend on the site's markup and must be
            # adjusted if the page layout changes.
            title_el = soup.select_one('.entry-title')
            if title_el is None:
                # No title element: not an article page, nothing to record.
                continue
            tag_el = soup.select_one('.meta-category > a')
            date_el = soup.select_one('.meta-date > time')
            if tag_el is None or date_el is None:
                print(url, 'missing category or date element, skipped')
                continue

            title = title_el.text.translate(strip_table)
            tags = tag_el.text
            update_date = date_el.text.translate(strip_table)

            # Only the first five header columns are filled; 'source' and
            # 'password' stay empty.
            sheet.append([num, url, title, tags, update_date])
        except Exception as e:
            # Best-effort crawl: report the failing URL and move on so that a
            # single bad page does not abort the remaining requests.
            print(url)
            print(e)
finally:
    session.close()
    # Save in ``finally`` so rows collected so far survive an interruption
    # (for example Ctrl-C) or an unexpected error.
    # The file is written to the current working directory and silently
    # overwrites an existing file of the same name; change the path here if
    # needed.
    workbook.save(f'web_data{range_start}-{range_end}.xlsx')