From 590f7ba6a1a1e4f64975f77dc1d7181a07ef9791 Mon Sep 17 00:00:00 2001
From: huzhujiang
Date: Thu, 12 Jun 2025 10:17:46 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9Eexcel=E8=BD=ACta=E7=94=B3?=
 =?UTF-8?q?=E8=AF=B7=E6=96=87=E4=BB=B6=E7=9A=84=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GenSql.py              | 323 +++++++++++++++++++++++++++++++++++++++++
 zsjieSpiderComplete.py | 112 ++++++++++++++
 2 files changed, 435 insertions(+)
 create mode 100644 GenSql.py
 create mode 100644 zsjieSpiderComplete.py

diff --git a/GenSql.py b/GenSql.py
new file mode 100644
index 0000000..ad22c49
--- /dev/null
+++ b/GenSql.py
@@ -0,0 +1,323 @@
+"""Generate fixed-width TA request test files from an Excel sheet.
+
+Field layouts (byte offset and length per field id) are read from the
+``tbfieldmap`` / ``tbtadict`` tables.  Each Excel row is overlaid onto the
+sample record of the template that the row's seller code is mapped to.
+"""
+import os
+import sys
+
+import mysql.connector
+import pandas as pd
+from mysql.connector import errorcode
+
+DB_CONFIG = {
+    'user': 'ifm50',
+    # Credentials do not belong in source control: set GENSQL_DB_PASSWORD.
+    'password': os.environ.get('GENSQL_DB_PASSWORD', ''),
+    'host': '192.168.200.128',
+    'database': 'ifm50',
+    'port': 3306,
+    'raise_on_warnings': True,  # raise an exception on MySQL warnings
+}
+
+
+def connect_to_mysql():
+    """Connect to MySQL with DB_CONFIG.
+
+    Returns a ``(connection, cursor)`` pair, or ``(None, None)`` after
+    printing the reason when the connection cannot be established.
+    """
+    try:
+        cnx = mysql.connector.connect(**DB_CONFIG)
+        cursor = cnx.cursor()
+        print("成功连接到 MySQL 数据库")
+        return cnx, cursor
+    except mysql.connector.Error as err:
+        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
+            print("访问被拒绝:用户名或密码错误")
+        elif err.errno == errorcode.ER_BAD_DB_ERROR:
+            print(f"数据库 '{DB_CONFIG['database']}' 不存在")
+        else:
+            print(f"连接 MySQL 时发生错误: {err}")
+        return None, None
+
+
+def _file_types_for(template):
+    """Return the file types generated for *template*.
+
+    WMDEP templates get "005" (plus "052" unless the template is WMDEP1.1);
+    all other templates get "03" plus "F3".
+    """
+    is_wmdep = str(template).startswith("WMDEP")
+    file_types = ["005"] if is_wmdep else ["03"]
+    if template != "WMDEP1.1":
+        file_types.append("052" if is_wmdep else "F3")
+    return file_types
+
+
+def _load_layouts(cursor, demoStr):
+    """Load the seller-to-template mapping and each template's field layout.
+
+    Returns ``(seller_templates, layouts, record_lengths)``.  ``layouts`` maps
+    "<template>-<fileType>" to ``{idno: [templet, file_type, field_no,
+    data_type, field_len, scale, field_name, byte_offset]}``;
+    ``record_lengths`` maps the same key to the gb2312 byte length of its
+    sample record in *demoStr*.
+    """
+    seller_templates = {}
+    templates = []
+    cursor.execute("select t.bank_no, t.seller_code, t.templet from tbbankta t")
+    for item in cursor.fetchall():
+        seller_templates[str(item[1])] = item[2]
+        if item[2] not in templates:
+            templates.append(item[2])
+
+    layouts = {}
+    record_lengths = {}
+    for template in templates:
+        for fileType in _file_types_for(template):
+            key = f"{template}-{fileType}"
+            if not demoStr.get(key):
+                print(f"未提供[{template}+{fileType}]的正确数据!")
+                continue
+            record_len = len(demoStr[key].encode("gb2312"))
+            # Parameterized query: values are never spliced into the SQL text.
+            cursor.execute(
+                """
+                select t.templet, t.file_type, t.field_no, t2.data_type,
+                       t2.field_len, t2.scale, t2.field_name, t.idno
+                from tbfieldmap t
+                left join tbtadict t2 on t.templet = t2.templet and t.idno = t2.idno
+                where t.templet = %s and file_type = %s
+                order by t.templet, t.file_type, t.field_no
+                """,
+                (template, fileType),
+            )
+            rows = cursor.fetchall()
+            if not rows:
+                continue
+            offset = 0
+            layout = {}
+            for item in rows:
+                # idno -> query columns 0-6 followed by the field's byte offset
+                layout[str(item[7])] = [*item[:7], offset]
+                offset += item[4]
+            if offset != record_len:
+                print(f"模板[{template}-{fileType}]数据库长度[{offset}]不等于案例长度[{record_len}]")
+            layouts[key] = layout
+            record_lengths[key] = record_len
+    return seller_templates, layouts, record_lengths
+
+
+def _fill_record(sample, row, columns, layout, record_len):
+    """Return *sample* with every non-empty cell of *row* written into its field.
+
+    Character fields (data_type "c") are left-aligned and space padded; all
+    other fields are right-aligned and zero padded.  Filling stops early, with
+    a diagnostic, as soon as a replacement changes the record's byte length.
+    """
+    record = sample
+    for col in columns:
+        column = str(col)
+        if column not in layout or pd.isna(row[col]):
+            continue
+        data_type, length, scale, offset = (layout[column][i] for i in (3, 4, 5, 7))
+        # Cells are read with dtype=object and may be int/float: always pad a str.
+        value = str(row[col])
+        if data_type.lower() == 'c':
+            content = pad_by_bytes(value, length, direction='right')
+        else:
+            if (scale or 0) > 0:
+                # NOTE(review): values of fields with decimals are only echoed
+                # to stderr here - presumably rescaling is still to be done.
+                print(value, file=sys.stderr)
+            content = pad_by_bytes(value, length, pad_char="0")
+        record = (slice_up_to_bytes(record, offset) + content
+                  + slice_from_byte_offset(record, offset + length))
+        new_len = len(record.encode("gb2312"))
+        if new_len != record_len:
+            print(f"替换[{column}]后长度变化[{record_len}]->[{new_len}]!请检查!")
+            print(sample)
+            print(record)
+            break
+    return record
+
+
+def genTestTxtFromExcel(demoStr, excelPath, rename_map, final_file_poath):
+    """Write one fixed-width record per Excel row and file type.
+
+    :param demoStr: maps "<template>-<fileType>" to a sample record.
+    :param excelPath: workbook to read; the sheet "Sheet2" is used.
+    :param rename_map: maps Excel column titles to field ids (as strings).
+    :param final_file_poath: prefix of the output files, which are named
+        ``final-<seller>-<template>-<fileType>.txt`` and gb2312 encoded.
+    :return: None.  Nothing is generated when the database is unreachable or
+        the sheet has no seller code column ("121").
+    """
+    cnx, cursor = connect_to_mysql()
+    if cnx is None:
+        # connect_to_mysql() already printed the reason; going on would only
+        # fail with AttributeError on ``None.execute``.
+        return
+    try:
+        bankTATemplate, templateDictMap, recordLengths = _load_layouts(cursor, demoStr)
+
+        df = pd.read_excel(excelPath, sheet_name="Sheet2", dtype='object', keep_default_na=True)
+        df.rename(columns=rename_map, inplace=True)
+
+        if "121" not in df.columns:
+            print("未提供销售商代码!", file=sys.stderr)
+            return
+
+        # A column is unused only when no layout at all knows its idno.
+        known_ids = set().union(*templateDictMap.values())
+        notUseCol = [str(col) for col in df.columns if str(col) not in known_ids]
+        if notUseCol:
+            print(f"excel存在没有用到的数据idno{notUseCol}!")
+
+        started_files = set()
+        for _, row in df.iterrows():
+            if pd.isna(row["121"]):
+                print("未提供销售商代码!", file=sys.stderr)
+                continue
+            seller = str(row["121"])
+            # Sellers missing from tbbankta fall back to the WMDEP1.1 template.
+            template = bankTATemplate.get(seller, "WMDEP1.1")
+            for fileType in _file_types_for(template):
+                key = f"{template}-{fileType}"
+                if key not in templateDictMap:
+                    print(f"未提供[{template}+{fileType}]的正确数据!")
+                    continue
+                final_file = f"{final_file_poath}final-{seller}-{template}-{fileType}.txt"
+                # The first record of a run truncates the file, later ones append.
+                mode = 'a' if final_file in started_files else 'w'
+                started_files.add(final_file)
+                finalStr = _fill_record(demoStr[key], row, df.columns,
+                                        templateDictMap[key], recordLengths[key])
+                with open(final_file, mode=mode, encoding='gb2312') as f:
+                    f.write(finalStr + "\n")
+    finally:
+        # Close the cursor before the connection that owns it.
+        cursor.close()
+        print("cursor 已关闭!")
+        cnx.close()
+        print("cnx 已关闭!")
+
+
+def slice_up_to_bytes(text: str, max_bytes: int, encoding: str = 'gb2312') -> str:
+    """Return the longest prefix of *text* that encodes to at most *max_bytes* bytes."""
+    byte_count = 0
+    char_index = 0
+    for i, char in enumerate(text):
+        char_byte_len = len(char.encode(encoding))
+        if byte_count + char_byte_len > max_bytes:
+            break
+        byte_count += char_byte_len
+        char_index = i + 1
+    return text[:char_index]
+
+
+def slice_from_byte_offset(text: str, start_byte: int, encoding: str = 'gb2312') -> str:
+    """Return the tail of *text* that starts at byte offset *start_byte*.
+
+    When the offset falls inside a multi-byte character, "!!!" is printed and
+    the tail starts at the next character.  An offset at or beyond the end of
+    the text yields "".
+    """
+    bytes_passed = 0
+    for i, char in enumerate(text):
+        if bytes_passed >= start_byte:
+            if bytes_passed > start_byte:
+                print("!!!")
+            return text[i:]
+        bytes_passed += len(char.encode(encoding))
+    return ""
+
+
+def truncate_by_bytes(text: str, max_bytes: int, encoding: str = 'gb2312') -> str:
+    """Return *text* cut to at most *max_bytes* encoded bytes.
+
+    A multi-byte character split by the cut is dropped rather than raising.
+    """
+    encoded_text = text.encode(encoding)
+    if len(encoded_text) <= max_bytes:
+        return text
+    return encoded_text[:max_bytes].decode(encoding, 'ignore')
+
+
+def pad_by_bytes(
+    text: str,
+    total_byte_length: int,
+    encoding: str = 'gb2312',
+    pad_char: str = ' ',
+    direction: str = 'left'
+) -> str:
+    """Pad *text* to *total_byte_length* encoded bytes, truncating if too long.
+
+    :param text: the original string.
+    :param total_byte_length: target length in bytes.
+    :param encoding: encoding used to measure byte lengths.
+    :param pad_char: padding character; must encode to a single byte.
+    :param direction: 'left' pads in front of the text, 'right' behind it.
+    :return: the padded string, or the truncated one if *text* was too long.
+    :raises ValueError: multi-byte *pad_char*, or unknown *direction* when padding.
+    """
+    if len(pad_char.encode(encoding)) != 1:
+        raise ValueError(f"补全字符 '{pad_char}' 在编码 '{encoding}' 下不是单字节字符。")
+
+    current_byte_length = len(text.encode(encoding))
+    if current_byte_length > total_byte_length:
+        # Too long: report it and cut instead of padding.
+        print(text+"超过限制长度:" + str(total_byte_length), file=sys.stderr)
+        return truncate_by_bytes(text, total_byte_length, encoding)
+
+    padding_string = pad_char * (total_byte_length - current_byte_length)
+    if direction == 'right':
+        return text + padding_string
+    elif direction == 'left':
+        return padding_string + text
+    else:
+        raise ValueError("补全方向 (direction) 必须是 'left' 或 'right'。")
+
+
+if __name__ == "__main__":
+    demoStr = {
+        "BZD21-03": "202504010100000000000001w130001 20250401120000jyzhsellercod0001033 00000000000000000000000030000000020 00000 156 2025040100000000 00000000 00000 0000000000 0000000000000000测试认购 0000000000000000000000000 000000000 00000000 110000 00 00000 0000000000000000 00000 0000000000000000000000000",
+        "BZD21-F3": "033 0012025040101002025040100010140802199205188210 01 C101021 0",
+        "WMDEP1.0-005": "202504010100000000000001000000 0000 156 0000000000000000 130001 0 2025040120250401120000 0 00011630000000048020 000000000000000000000000000000000000000030000000022NY0000000048 000000 0000000000000000 00000 00000测试申购 0 000 000000000000000000000 00000 0000000000000000000000000 110000 ",
+        "WMDEP1.0-052": "2025040101000000000000010 130001 E90000001 00011630000000048020 NY00000000486214855862774691 110000C20250401000000000000000000001 01T20250401000000000000000000001C100020001 交通银行武汉支行 ",
+        "WMDEP1.1-005": "202504010100000000000001000000 0000 156 0000000000000000 130001 0 2025040120250401120000 0 00011630000000048028 000000000000000000000000000000000000000030000000022NY0000000048 000000 0000000000000000 00000 00000测试申购 0 000 000000000000000000000 00000 0000000000000000000000000 110000 6214855862774691 110000C20250401000000000000000000001 01T20250401000000000000000000001C100020001 交通银行武汉支行 ",
+    }
+    excelPath = "C:\\Users\\huzhujiang\\Desktop\\测试数据\\测试造数excel.xlsx"
+    final_file_poath = "C:\\Users\\huzhujiang\\Desktop\\测试数据\\"
+    rename_map = {
+        "流水号": "8",
+        "销售商": "121",
+        "客户名称": "85",
+        "个人/机构标志": "98",
+        "理财账号": "136",
+        "交易账号": "120",
+        "证件类型": "27",
+        "证件号码": "72",
+        "个人养老金账户": "11103",
+        "个人养老金资金账户": "11104",
+        "理财行业平台养老金账户": "11105",
+        "投资者类别": "1012",
+        "其他证件名称": "1013",
+        "合格投资者标识": "1150",
+        "SPV资金托管账户开户行": "1151",
+        "其他资金托管账户开户行": "1152",
+        "核心客户号": "9904",
+        "关联活期存款账号": "9902",
+        "关联活期存款账号开户行代码": "10003",
+        "关联活期存款账号开户行名称": "10004",
+        "交易日期": "92",
+        "性别": "126",
+        "投资者身份日期": "23",
+        "个人机构类型": "325",
+        "投资者户名简称": "122",
+        "产品代码": "67",
+        "交易码": "135",
+    }
+    genTestTxtFromExcel(demoStr, excelPath, rename_map, final_file_poath)
diff --git a/zsjieSpiderComplete.py b/zsjieSpiderComplete.py
new file mode 100644
index 0000000..ae08342
--- /dev/null
+++ b/zsjieSpiderComplete.py
@@ -0,0 +1,112 @@
+"""Fetch the download links of a zsjie.com post via its WordPress AJAX endpoint."""
+import os
+
+import mysql.connector
+import requests
+from bs4 import BeautifulSoup
+from mysql.connector import errorcode
+
+DB_CONFIG = {
+    'user': 'zsjie',
+    # Credentials do not belong in source control: set ZSJIE_DB_PASSWORD.
+    'password': os.environ.get('ZSJIE_DB_PASSWORD', ''),
+    'host': '111.180.203.166',
+    'database': 'zsjie',
+    'port': 25506,
+    'raise_on_warnings': True,  # raise an exception on MySQL warnings
+}
+
+
+def connect_to_mysql():
+    """Connect to MySQL with DB_CONFIG.
+
+    Returns a ``(connection, cursor)`` pair, or ``(None, None)`` after
+    printing the reason when the connection cannot be established.
+    """
+    try:
+        cnx = mysql.connector.connect(**DB_CONFIG)
+        cursor = cnx.cursor()
+        print("成功连接到 MySQL 数据库")
+        return cnx, cursor
+    except mysql.connector.Error as err:
+        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
+            print("访问被拒绝:用户名或密码错误")
+        elif err.errno == errorcode.ER_BAD_DB_ERROR:
+            print(f"数据库 '{DB_CONFIG['database']}' 不存在")
+        else:
+            print(f"连接 MySQL 时发生错误: {err}")
+        return None, None
+
+
+def getHttpContext(url, timeout=30, verify=True):
+    """POST the view-count and download-info AJAX actions for post 61007.
+
+    :param url: the site's ``admin-ajax.php`` endpoint.
+    :param timeout: seconds to wait for each request.
+    :param verify: TLS certificate verification, passed on to ``requests``;
+        keep it enabled because a login cookie is sent with the request.
+    :return: the links in the second response that start with
+        ``https://www.zsjie.com/goto?``.
+    """
+    # Deliberately absent: the browser's HTTP/2 pseudo-headers (authority,
+    # method, path, scheme), which are not request headers, and
+    # Accept-Encoding, so requests only advertises encodings it can decode.
+    headers = {
+        'Accept': 'text/html,*/*;q=0.01',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+        # The login cookie is a secret and expires: set ZSJIE_COOKIE.
+        'Cookie': os.environ.get('ZSJIE_COOKIE', ''),
+        'Origin': 'https://www.zsjie.com',
+        'Priority': 'u=1, i',
+        'Referer': 'https://www.zsjie.com/61007.html',
+        'Sec-Ch-Ua': '"Chromium";v="136", "Microsoft Edge";v="136", "Not.A/Brand";v="99"',
+        'Sec-Ch-Ua-Mobile': '?0',
+        'Sec-Ch-Ua-Platform': '"Windows"',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-origin',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0',
+        'X-Requested-With': 'XMLHttpRequest',  # marks the call as an AJAX request
+    }
+    payload1 = {'action': 'add_post_views_num', 'id': '61007'}
+    payload = {'action': 'get_async_shop_down', 'post_id': '61007'}
+    response = requests.post(url=url, headers=headers, params=payload1, timeout=timeout, verify=verify)
+    print(response.text)
+
+    response = requests.post(url=url, headers=headers, params=payload, timeout=timeout, verify=verify)
+    print(response.text)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    target_link_prefix = "https://www.zsjie.com/goto?"
+    extracted_links = []
+    for div in soup.find_all('div', class_='btn-group btn-block mt-2'):
+        for a_tag in div.find_all('a', href=True):  # only anchors that carry an href
+            if a_tag['href'].startswith(target_link_prefix):
+                extracted_links.append(a_tag['href'])
+    return extracted_links
+
+
+def spide():
+    """Call getHttpContext for up to 50 rows of the ``resource`` table.
+
+    Rows are those with a blank resource_url and an available_flag other
+    than '0', newest id first.
+    """
+    cnx, cursor = connect_to_mysql()
+    if cnx is None:
+        return
+    try:
+        cursor.execute("select * from resource where resource_url =' ' and available_flag !='0' order by id desc limit 0,50 ")
+        for item in cursor.fetchall():
+            print(item)
+            # NOTE(review): column 1 is used as the URL - confirm with the schema.
+            getHttpContext(item[1])
+    finally:
+        cursor.close()
+        print("cursor 已关闭!")
+        cnx.close()
+        print("cnx 已关闭!")
+
+
+if __name__ == "__main__":
+    getHttpContext('https://www.zsjie.com/wp-admin/admin-ajax.php')