01495251 1 mēnesi atpakaļ
vecāks
revīzija
715c857267

+ 0 - 1
crossborder/fujian/fujian_parse_excel.py

@@ -17,7 +17,6 @@ FUJIAN_CITY = {
 "南平市": "350700",
 "宁德市": "350900",
 "龙岩市": "350800",
-"平潭地区": "350128"
 }
 
 

+ 3 - 2
crossborder/guangdong/guangdong_sub_customs_parse_excel.py

@@ -420,13 +420,14 @@ def process_zhanjiang_customs(file_path, year, month):
     # 判断是否应执行核心逻辑
     if _zhanjiang_first_month is None:
         # 第一次调用,记录初始年月 (year, month)
-        _zhanjiang_first_month = month
+        _zhanjiang_first_month = (year, month)
         should_execute = True
     else:
         # 后续调用仅在以下情况下执行:
         # - 与初次调用的 (year, month) 相同(允许多城市同时处理)
         # - 或者 month == 12
-        should_execute = (month == _zhanjiang_first_month) or (month == 12)
+        current_flag = (year, month)
+        should_execute = (current_flag == _zhanjiang_first_month) or (month == 12)
 
     if not should_execute:
         log.warning(f"跳过湛江海关{year}年{month}文件: {file_path.name}")

+ 5 - 1
crossborder/guangdong/selenium_guangdong_download.py

@@ -4,6 +4,7 @@ import re
 import time
 from datetime import datetime, timedelta
 
+import numpy as np
 import pandas as pd
 from selenium import webdriver
 from selenium.common import TimeoutException
@@ -240,6 +241,8 @@ def parse_commodity_table(driver, data_type, year, month):
 
             # 清洗商品名称(处理 &nbsp; 和空格)
             name = clean_commodity_name(cols[0])
+            if name == '肉类':
+                name = '肉类(包括杂碎)'
 
             if month == 2:
                 # 处理合并后的1月和2月数据
@@ -298,7 +301,7 @@ def merge_commodity_data(import_data, export_data, year, month):
         df_export,
         on=['commodity_code',  'commodity_name',  'crossborder_year_month'],
         how='outer'
-    ).fillna(0)
+    )
 
     # 计算总量(可选,根据表结构需求)
     merged_df['monthly_total'] = merged_df['monthly_import'] + merged_df['monthly_export']
@@ -353,6 +356,7 @@ def reverse_crawler(driver, target_months):
                     # 确保同时有进口和出口数据
                     if export_data and import_data:
                         final_df = merge_commodity_data(export_data, import_data, year, month)
+                        final_df = final_df.replace({np.nan: None})
                         db.bulk_insert(df=final_df, table_name='t_yujin_crossborder_prov_commodity_trade',
                                       conflict_columns=['commodity_code', 'crossborder_year_month'],
                                       update_columns=['monthly_import', 'monthly_export', 'monthly_total'])

+ 30 - 3
crossborder/quanguo/data_cleaning_to_db.py

@@ -128,10 +128,37 @@ def main():
                     if not file.endswith(('.xls', '.xlsx')):
                         continue
 
-                    if '(8)' in file:
+                    # if '(1)' in file and '年度表' in file:
+                    #     log.info(f"处理年度汇总表: {file}")
+                    #     parse_year_table_excel(full_path)
+                    #
+                    # elif '(1)' in file and '月度表' in file:
+                    #     log.info(f"处理月度汇总表: {file}")
+                    #     parse_month_table_excel(full_path)
+                    #
+                    # elif '(2)' in file:
+                    #     log.info(f"处理国别(地区)贸易表: {file}")
+                    #     parse_country_table_excel(full_path)
+                    #
+                    # elif '(4)' in file:
+                    #     log.info(f"处理类章贸易表: {file}")
+                    #     parse_commodity_table_excel(full_path)
+
+                    elif '(8)' in file:
                         log.info(f"处理收发货人所在地表: {file}")
                         parse_region_table_excel(full_path)
 
+                    # elif '(15)' in file:
+                    #     log.info(f"处理对部分国家(地区)出口类章金额表: {full_path}")
+                    #     parse_commodity_country_detail(full_path, "export")
+                    #
+                    # elif '(16)' in file:
+                    #     log.info(f"处理自部分国家(地区)进口类章金额表: {full_path}")
+                    #     parse_commodity_country_detail(full_path, "import")
+                    #
+                    # else:
+                    #     log.warning(f"未知类型文件,跳过: {full_path}")
+
             log.info(f"{year} 年的数据处理完成!")
 
         except Exception as e:
@@ -143,5 +170,5 @@ def main():
 
 
 if __name__ == "__main__":
-    # process_all_region_tables()
-    perform_data_cleanup_and_import(2025)
+    main()
+    # perform_data_cleanup_and_import(2025)

+ 7 - 0
crossborder/quanguo/parse_year_excel.py

@@ -3,6 +3,7 @@ import xlrd
 from crossborder.utils.db_helper import DBHelper
 from crossborder.utils.parse_utils import convert_unit, parse_ratio
 
+_parse_executed = False  # 模块级变量,控制执行次数
 
 def get_upsert_sql():
     """使用命名占位符并正确使用VALUES函数的SQL"""
@@ -26,6 +27,11 @@ def get_upsert_sql():
 
 
 def parse_year_table_excel(file):
+    global _parse_executed
+    if _parse_executed:
+        print("⚠️ parse_year_table_excel 已执行过,不再重复执行")
+        return
+
     db_helper = DBHelper()
     current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
@@ -63,6 +69,7 @@ def parse_year_table_excel(file):
     try:
         affected_rows = db_helper.execute_sql_with_params(sql, params_list)
         print(f"成功处理 {len(params_list)} 条数据,受影响行数:{affected_rows}")
+        _parse_executed = True
     except Exception as e:
         print(f"数据库操作失败: {e}")
         raise

+ 49 - 29
crossborder/quanguo/selenium_download.py

@@ -1,26 +1,19 @@
 import argparse
-import os
 import random
 import re
 import time
 from datetime import datetime
-from pathlib import Path
 
-from faker import Faker
 from selenium import webdriver
 from selenium.common import StaleElementReferenceException
-from selenium.webdriver import FirefoxOptions, ActionChains
+from selenium.webdriver import ActionChains
+from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
 from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import
-from crossborder.quanguo.parse_country_table_excel import parse_country_table_excel
-from crossborder.quanguo.parse_month_excel import parse_month_table_excel
-from crossborder.quanguo.parse_year_excel import parse_year_table_excel
 from crossborder.utils.constants import DOWNLOAD_DIR
-from crossborder.utils.download_utils import configure_stealth_options, wait_for_download, download_excel
-from selenium.webdriver.common.by import By
-
+from crossborder.utils.download_utils import configure_stealth_options, download_excel
 from crossborder.utils.log import log
 
 base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
@@ -40,36 +33,62 @@ def generate_table_title(year):
         f"(16){year}年自部分国家(地区)进口商品类章金额表"
     ]
 
+
 def process_table_row(row):
-    """动态处理表格行数据(Selenium语法)"""
+    """更健壮的表格行处理函数"""
     try:
-        # 获取所有表格单元格(td)元素
-        cells = row.find_elements(By.TAG_NAME, 'td')
+        # 使用相对定位获取单元格
+        cells = WebDriverWait(row, 15).until(
+            EC.presence_of_all_elements_located((By.XPATH, "./td"))
+        )
+
         if len(cells) < 2:
             return None
 
-        # 获取表格名
-        table_name = cells[0].text.strip()
+        # 使用文本内容稳定性检查
+        table_name = None
+        for attempt in range(3):
+            try:
+                table_name = cells[0].text.strip()
+                if table_name:  # 确认文本稳定获取
+                    break
+            except StaleElementReferenceException:
+                # 重新获取单元格
+                cells = row.find_elements(By.XPATH, "./td")
+                if len(cells) < 2:
+                    return None
+                time.sleep(0.5)
+
+        if not table_name:
+            return None
 
-        # 获取第二列中的所有链接,提取月份和href
+        # 月份链接处理(使用更稳定的XPATH)
         month_links = []
-        links = cells[1].find_elements(By.TAG_NAME, 'a')
-        for a in links:
-            # 获取文本并去掉‘月’
-            month_text = a.text
-            if '月' in month_text:
-                month = int(month_text.replace('月', '').strip())
-                href = a.get_attribute('href')
-                if href:
-                    month_links.append((month, href))
-
-        # 按月份倒序排列(1-12月)
-        month_links.sort(key=lambda x: x[0], reverse=True)
+        link_elements = cells[1].find_elements(By.XPATH, ".//a")
+
+        for a in link_elements:
+            try:
+                # 添加临时等待避免元素状态变化
+                time.sleep(0.2)
+                month_text = a.text.strip()
+                if '月' in month_text:
+                    month = int(month_text.replace('月', '').strip())
+                    href = a.get_attribute('href')
+                    if href:
+                        month_links.append((month, href))
+            except StaleElementReferenceException:
+                continue  # 跳过已无效的链接
+            except Exception as e:
+                log.debug(f"处理链接异常: {str(e)}")
+
+        # 如果获取到链接再排序
+        if month_links:
+            month_links.sort(key=lambda x: x[0], reverse=True)
 
         return (table_name, month_links)
 
     except Exception as e:
-        log.info(f"表格行处理异常: {str(e)}")
+        log.info(f"表格行处理异常: {str(e)}", exc_info=True)
         return None
 
 
@@ -181,3 +200,4 @@ if __name__ == "__main__":
         log.info("【海关总署】全年数据抓取结束".center(66, "*"))
         log.info("\n数据清洗入库中...")
         perform_data_cleanup_and_import(current_year)
+        log.info("\n数据清洗入库完毕...")

+ 6 - 3
crossborder/utils/download_utils.py

@@ -106,14 +106,17 @@ def download_excel(driver, url, year, month, title, download_dir):
         ActionChains(driver).move_to_element(download_btn).pause(0.3).click().perform()
         # log.info(f"√ 已点击下载按钮:{download_btn.get_attribute("href")}")
 
-        downloaded_file  = wait_for_download(download_dir)
         final_dir = Path(f'{download_dir}/{year}/{month:02d}')
-        final_path = Path(f'{final_dir}/{title}{downloaded_file.suffix}')
         # 删除 final_dir 中所有与 base_name 同名但不同后缀的文件
         for old_file in final_dir.glob(f'{title}.*'):
             if old_file.is_file():
                 old_file.unlink()
-                log.info(f"已删除旧文件:{old_file}")
+                log.debug(f"已删除旧文件:{old_file}")
+
+        downloaded_file  = wait_for_download(download_dir)
+
+        final_path = Path(f'{final_dir}/{title}{downloaded_file.suffix}')
+
         download_rel_dir = Path(f'{download_dir}/{year}/{month:02d}')
         download_rel_dir.mkdir(parents=True, exist_ok=True)
         downloaded_file.rename(final_path)

+ 4 - 3
pyproject.toml

@@ -9,8 +9,8 @@ license = {text = "Compatible Python versions [^3.6]: ^3.8"}
 readme = "README.md"
 requires-python = "^3.10"
 dependencies = [
-    "requests (>=2.32.3,<3.0.0)",
-    "attrs (>=25.3.0)",
+    "requests >=2.32.3,<3.0.0",
+    "attrs >=25.3.0",
     "automat (>=25.4.16)",
     "beautifulsoup4 (>=4.13.4)",
     "bs4 (>=0.0.2)",
@@ -82,7 +82,8 @@ dependencies = [
     "websocket-client (>=1.8.0)",
     "wsproto (>=1.2.0)",
     "xlrd (>=2.0.1)",
-    "zope-interface (>=7.2)"
+    "zope-interface (>=7.2)",
+    "pycryptodome (>=3.23.0,<4.0.0)",
 ]
 
 [tool.poetry.scripts]