01495251 1 неделя назад
Сommit
bed8334e32
47 измененных файлов с 6157 добавлено и 0 удалено
  1. 1 0
      .gitignore
  2. 8 0
      .idea/.gitignore
  3. 10 0
      .idea/Crossborder.iml
  4. 6 0
      .idea/inspectionProfiles/profiles_settings.xml
  5. 6 0
      .idea/misc.xml
  6. 8 0
      .idea/modules.xml
  7. 212 0
      db_helper.py
  8. 0 0
      fujian/__init__.py
  9. 307 0
      fujian/fujian_parse_excel.py
  10. 200 0
      fujian/selenium_fujian_download.py
  11. 0 0
      guangdong/__init__.py
  12. 11 0
      guangdong/cross.log
  13. 291 0
      guangdong/selenium_guangdong_city.py
  14. 440 0
      guangdong/selenium_guangdong_download.py
  15. 0 0
      henan/__init__.py
  16. 265 0
      henan/henan_parse_excel.py
  17. 233 0
      henan/selenium_henan_download.py
  18. 469 0
      quanguo/CountryTrade.py
  19. 178 0
      quanguo/CountryTradeYear.py
  20. 147 0
      quanguo/CrossDownload.py
  21. 148 0
      quanguo/CrossDownloadYear.py
  22. 38 0
      quanguo/ScrpyDownload.py
  23. 0 0
      quanguo/__init__.py
  24. 87 0
      quanguo/commodity_trade.py
  25. 193 0
      quanguo/commodity_trade_year.py
  26. BIN
      quanguo/converted.xlsx
  27. BIN
      quanguo/converted_export.xlsx
  28. BIN
      quanguo/converted_import.xlsx
  29. 154 0
      quanguo/customs_data.json
  30. 166 0
      quanguo/detail.py
  31. 167 0
      quanguo/detail_year.py
  32. 126 0
      quanguo/monthData.py
  33. 111 0
      quanguo/monthData2023.py
  34. 90 0
      quanguo/pc.py
  35. 140 0
      quanguo/region_trade.py
  36. 190 0
      quanguo/region_trade2024.py
  37. 107 0
      quanguo/yearData.py
  38. 231 0
      selenium_download.py
  39. 0 0
      shandong/__init__.py
  40. 192 0
      shandong/selenium_shandong_download.py
  41. 207 0
      shandong/selenium_shandong_read.py
  42. 309 0
      shandong/shandong_parse_excel.py
  43. 0 0
      utils/__init__.py
  44. 332 0
      utils/constants.py
  45. 216 0
      utils/download_utils.py
  46. 25 0
      utils/log.py
  47. 136 0
      utils/parse_utils.py

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+/downloads/

+ 8 - 0
.idea/.gitignore

@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

+ 10 - 0
.idea/Crossborder.iml

@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.venv" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.13 (Crossborder)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

+ 6 - 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 6 - 0
.idea/misc.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.13 (Crossborder)" />
+  </component>
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Crossborder.iml" filepath="$PROJECT_DIR$/.idea/Crossborder.iml" />
+    </modules>
+  </component>
+</project>

+ 212 - 0
db_helper.py

@@ -0,0 +1,212 @@
+
+
+from sqlalchemy import create_engine, text
+import logging
+import pymysql
+import pandas as pd
+
+# Alternate connection settings, kept commented out (the database name
+# suggests a UAT environment -- TODO confirm before re-enabling).
+# DB_CONFIG = {
+#     'host': '10.130.75.149',
+#     'port': 3307,
+#     'user': 'yto_crm',
+#     'password': '%3sFUlsolaRI',
+#     'database': 'crm_uat',
+#     'charset': 'utf8mb4'
+# }
+
+# Active connection settings consumed by DBHelper.__init__, which interpolates
+# every value verbatim into a SQLAlchemy URL. The password therefore has to be
+# stored already URL-encoded ('%40' presumably stands for '@' -- verify).
+# NOTE(review): credentials are committed in plain text; consider loading them
+# from the environment or a secrets store instead.
+DB_CONFIG = {
+    'host': '10.130.36.185',
+    'port': 3306,
+    'user': 'user_ytexp',
+    'password': 'Rn9ib3L1C4b4%40123',
+    'database': 'yto_crm',
+    'charset': 'utf8mb4'
+}
+
+
+
+class DBHelper:
+    """Thin SQLAlchemy wrapper around the cross-border trade MySQL tables.
+
+    Builds one pooled engine from the module-level ``DB_CONFIG`` and offers
+    helpers for category lookup, bulk insert/upsert and year-over-year (YoY)
+    recalculation on ``t_yujin_crossborder_prov_region_trade``.
+    """
+
+    # Class-level logger: the error handlers below rely on it, and keeping it
+    # on the class means it exists even if ``__init__`` was bypassed.
+    logger = logging.getLogger(__name__)
+
+    def __init__(self):
+        """Create the pooled engine from ``DB_CONFIG``."""
+        # Values are interpolated verbatim, so the password must already be
+        # URL-encoded in DB_CONFIG.
+        self.engine = create_engine(
+            f'mysql+pymysql://{DB_CONFIG["user"]}:{DB_CONFIG["password"]}@{DB_CONFIG["host"]}:{DB_CONFIG["port"]}/{DB_CONFIG["database"]}?charset={DB_CONFIG["charset"]}',
+            pool_size=5,
+            max_overflow=10
+        )
+
+    def get_commodity_id(self, name):
+        """Return the category id whose ``commodity_name`` equals *name*.
+
+        :param name: commodity name to look up
+        :return: the ``id`` of the first matching row, or ``None`` if no row matches
+        """
+        with self.engine.connect() as conn:
+            result = conn.execute(
+                text("SELECT id FROM t_yujin_crossborder_prov_commodity_category WHERE commodity_name = :name"),
+                {'name': name}
+            ).fetchone()
+            return result[0] if result else None
+
+    def bulk_insert(self, df, table_name, conflict_columns=None, update_columns=None):
+        """
+        Insert every row of *df* into *table_name*, optionally as an upsert.
+
+        :param df: DataFrame to insert; its column names are used as the SQL
+            column names and bind-parameter names
+        :param table_name: target table name (interpolated into the SQL text,
+            so it must come from trusted code, never from user input)
+        :param conflict_columns: only acts as a switch -- the upsert clause is
+            added when both this and *update_columns* are non-empty; the
+            columns themselves are not written into the statement
+        :param update_columns: columns to overwrite on a duplicate key
+        :raises Exception: any database error is printed and re-raised
+        """
+        if df.empty:
+            print("空数据集,跳过插入")
+            return
+
+        # Build the parameterised INSERT template from the DataFrame columns.
+        columns = ', '.join(df.columns)
+        placeholders = ', '.join([f":{col}" for col in df.columns])
+        sql = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"
+
+        # MySQL-specific upsert clause.
+        if conflict_columns and update_columns:
+            update_set = ', '.join([f"{col}=VALUES({col})" for col in update_columns])
+            sql += f" ON DUPLICATE KEY UPDATE {update_set}"
+
+        # One dict per row; passing the list makes execute() run in batch mode.
+        data = df.to_dict(orient='records')
+        try:
+            with self.engine.connect() as conn:
+                # Explicit transaction: all rows are committed or none.
+                with conn.begin():
+                    stmt = text(sql)
+                    conn.execute(stmt, data)
+
+                    print(f"成功插入/更新 {len(df)} 行到 {table_name}")
+        except Exception as e:
+            print(f"数据库操作失败: {str(e)}")
+            raise
+
+    def update_january_yoy(self, prov_name='福建省'):
+        """
+        Recompute the YoY percentages of a province's January rows.
+
+        Each January row later than '2023-01' is joined (on ``city_code``) to
+        the January row of the previous year; the three YoY columns become the
+        percentage change rounded to 4 decimals, or 0 when the previous value
+        is 0 or NULL.
+
+        :param prov_name: province name to update, defaults to '福建省'
+        :return: number of rows reported as affected by the UPDATE
+        :raises RuntimeError: wraps any failure of the UPDATE
+        """
+        update_sql = text("""
+                          UPDATE t_yujin_crossborder_prov_region_trade AS curr
+                              INNER JOIN t_yujin_crossborder_prov_region_trade AS prev
+                          ON curr.city_code = prev.city_code
+                              AND prev.crossborder_year_month = DATE_FORMAT(
+                              DATE_SUB(
+                              STR_TO_DATE(CONCAT(curr.crossborder_year_month, '-01'), '%Y-%m-%d'),
+                              INTERVAL 1 YEAR
+                              ),
+                              '%Y-01'
+                              )
+                              SET
+                                  curr.yoy_import_export = COALESCE (
+                                  ROUND(
+                                  (curr.monthly_total - prev.monthly_total) / NULLIF (prev.monthly_total, 0) * 100, 4
+                                  ), 0.0000
+                                  ), curr.yoy_import = COALESCE (
+                                  ROUND(
+                                  (curr.monthly_import - prev.monthly_import) / NULLIF (prev.monthly_import, 0) * 100, 4
+                                  ), 0.0000
+                                  ), curr.yoy_export = COALESCE (
+                                  ROUND(
+                                  (curr.monthly_export - prev.monthly_export) / NULLIF (prev.monthly_export, 0) * 100, 4
+                                  ), 0.0000
+                                  )
+                          WHERE
+                              curr.prov_name = :prov_name
+                            AND curr.crossborder_year_month LIKE '%-01'
+                            AND curr.crossborder_year_month
+                              > '2023-01'
+                          """)
+
+        try:
+            # engine.begin() commits on success and rolls back on error.
+            with self.engine.begin() as conn:
+                result = conn.execute(update_sql, {'prov_name': prov_name})
+                print(f"Updated {result.rowcount} rows for {prov_name}")
+                return result.rowcount
+
+        except Exception as e:
+            print(f"Update failed: {str(e)}")
+            raise RuntimeError(f"同比数据更新失败: {str(e)}") from e
+
+    def clear_old_shandong_yoy(self):
+        """
+        Zero the YoY columns of '山东省' rows earlier than '2024-01'.
+
+        Only rows with at least one non-zero YoY value are touched.
+
+        :return: number of rows reported as affected by the UPDATE
+        :raises Exception: any database error is logged and re-raised
+        """
+        clear_sql = text("""
+                         UPDATE t_yujin_crossborder_prov_region_trade
+                         SET yoy_import_export = 0.0000,
+                             yoy_export        = 0.0000,
+                             yoy_import        = 0.0000
+                         WHERE prov_name = '山东省'
+                           AND crossborder_year_month < '2024-01'
+                           AND (yoy_import_export != 0 
+               OR yoy_export != 0 
+               OR yoy_import != 0) -- 优化:仅更新非零记录
+                         """)
+
+        try:
+            with self.engine.begin() as conn:
+                result = conn.execute(clear_sql)
+                print(f"山东省旧数据清零记录数: {result.rowcount}")
+                return result.rowcount
+        except Exception as e:
+            # The instance never had a ``logger`` attribute, so the original
+            # handler raised AttributeError and hid the real database error;
+            # the class-level logger makes this call valid.
+            self.logger.error(f"旧数据清零失败: {str(e)}")
+            raise
+
+    def update_shandong_yoy(self):
+        """
+        Run the full '山东省' YoY refresh: zero old rows, then recompute new ones.
+
+        :return: dict with keys ``'cleared'`` and ``'updated'`` holding the row
+            counts returned by the two steps
+        :raises Exception: the original error of whichever step failed
+        """
+        try:
+            # Step 1: reset YoY on rows before 2024-01.
+            cleared = self.clear_old_shandong_yoy()
+
+            # Step 2: recompute YoY on rows from 2024-01 onwards.
+            updated = self._update_shandong_new_yoy()
+
+            print(f"山东省同比处理完成 | 清零:{cleared} 更新:{updated}")
+            return {'cleared': cleared, 'updated': updated}
+        except Exception:
+            # print() has no ``exc_info`` keyword, so the original call raised
+            # TypeError and replaced the real error; log with the traceback
+            # instead and let the original exception propagate.
+            self.logger.error("山东省数据处理失败", exc_info=True)
+            raise
+
+    def _update_shandong_new_yoy(self):
+        """
+        Recompute YoY for '山东省' rows from '2024-01' onwards (internal).
+
+        Each row is joined (on ``city_code``) to the row of the same month one
+        year earlier; rows whose previous ``monthly_total`` is NULL are skipped.
+
+        :return: number of rows reported as affected by the UPDATE
+        """
+        update_sql = text("""
+                          UPDATE t_yujin_crossborder_prov_region_trade AS curr
+                              INNER JOIN t_yujin_crossborder_prov_region_trade AS prev
+                          ON curr.city_code = prev.city_code
+                              AND prev.crossborder_year_month = DATE_FORMAT(
+                              DATE_SUB(
+                              STR_TO_DATE(CONCAT(curr.crossborder_year_month, '-01'), '%Y-%m-%d'),
+                              INTERVAL 1 YEAR
+                              ),
+                              '%Y-%m'
+                              )
+                              SET
+                                  curr.yoy_import_export = COALESCE (
+                                  ROUND(
+                                  (curr.monthly_total - prev.monthly_total) / NULLIF (prev.monthly_total, 0) * 100, 4
+                                  ), 0.0000
+                                  ), curr.yoy_import = COALESCE (
+                                  ROUND(
+                                  (curr.monthly_import - prev.monthly_import) / NULLIF (prev.monthly_import, 0) * 100, 4
+                                  ), 0.0000
+                                  ), curr.yoy_export = COALESCE (
+                                  ROUND(
+                                  (curr.monthly_export - prev.monthly_export) / NULLIF (prev.monthly_export, 0) * 100, 4
+                                  ), 0.0000
+                                  )
+                          WHERE
+                              curr.prov_name = '山东省'
+                            AND curr.crossborder_year_month >= '2024-01'
+                            AND prev.monthly_total IS NOT NULL
+                          """)
+
+        with self.engine.begin() as conn:
+            result = conn.execute(update_sql)
+            print(f"山东省新数据更新数: {result.rowcount}")
+            return result.rowcount

+ 0 - 0
fujian/__init__.py


+ 307 - 0
fujian/fujian_parse_excel.py

@@ -0,0 +1,307 @@
+import re
+from pathlib import Path
+
+import pandas as pd
+
+from db_helper import DBHelper
+from utils.constants import DOWNLOAD_DIR
+from utils.parse_utils import convert_wan_to_yuan, extract_year_month_from_path, traverse_and_process
+
+# City name -> six-digit city code, used by map_city_codes(); rows whose name
+# is not listed here are dropped there. (Presumably administrative division
+# codes -- verify against the target table.)
+FUJIAN_CITY = {
+"福州市": "350100",
+"厦门市": "350200",
+"莆田市": "350300",
+"三明市": "350400",
+"泉州市": "350500",
+"漳州市": "350600",
+"南平市": "350700",
+"宁德市": "350900",
+"龙岩市": "350800",
+"平潭地区": "350128"
+}
+
+
+# Province-level constants written into every output row, plus year/month
+# directory-name patterns (the two patterns are not referenced elsewhere in
+# this file).
+PROV_CODE = "350000"
+PROV_NAME = "福建省"
+YEAR_PATTERN = re.compile(r"^\d{4}$")
+MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")
+# Root folder holding the downloaded Fujian workbooks.
+download_dir = DOWNLOAD_DIR / "fujian"
+
+
+def parse_excel(current_dir):
+    """Parse one month directory and load its by-city trade table.
+
+    Args:
+        current_dir (str): directory of a single month's downloads; year and
+            month are derived from it by ``extract_year_month_from_path``.
+
+    Raises:
+        FileNotFoundError: no file whose name contains "分地市" is present.
+        Exception: any processing error is printed and re-raised.
+    """
+    current_path = Path(current_dir)
+    year, month = extract_year_month_from_path(current_path)
+
+    try:
+        # Pick the first workbook whose name marks it as the by-city table.
+        current_file_path = next(current_path.glob("*分地市*"), None)
+        if current_file_path is None:
+            # Fail with a clear message instead of handing None to
+            # pd.read_excel, which reports an obscure "invalid file path" error.
+            raise FileNotFoundError(f"未找到分地市表格文件:{current_path}")
+        process_region_trade(current_file_path, year, month)
+        print(f"{current_dir}数据已全部成功处理")
+    except Exception as e:
+        print(f"处理失败:{current_dir},错误:{str(e)}")
+        raise
+
+
+def process_region_trade(current_file_path, year, month):
+    """Load one month's by-city table, upsert it, and derive January when given February."""
+
+    # Workbooks up to May 2023 keep the single-month figures on the second
+    # sheet in the first seven columns; later ones use the first sheet with
+    # the figures spread over wider columns.
+    legacy_layout = year == 2023 and month <= 5
+    if legacy_layout:
+        usecols = list(range(7))
+        sheet_index = 1
+    else:
+        usecols = [0, 3, 4, 7, 8, 11, 12]
+        sheet_index = 0
+
+    current_df = load_and_process_data(
+        current_file_path, year, month, usecols, sheet_index
+    )
+
+    # Upsert keyed on (year-month, city).
+    key_columns = ['crossborder_year_month', 'city_code']
+    value_columns = [
+        'monthly_total', 'monthly_import', 'monthly_export',
+        'yoy_import_export', 'yoy_import', 'yoy_export'
+    ]
+    db = DBHelper()
+    bulk_insert_data(
+        db, current_df,
+        conflict_cols=key_columns,
+        update_cols=value_columns
+    )
+
+    # The February workbook is also used to reconstruct January.
+    if month == 2:
+        print(f"根据2月表格生成{year}年1月数据...")
+        handle_february_special_case(db, current_file_path, year, usecols)
+
+
+def load_and_process_data(file_path, year, month, usecols, sheet_index = 0):
+    """Read one sheet and run it through the standard cleaning steps."""
+    column_names = [
+        'city_name', 'monthly_total', 'yoy_import_export',
+        'monthly_export', 'yoy_export', 'monthly_import', 'yoy_import'
+    ]
+    raw = pd.read_excel(
+        file_path,
+        header=4,
+        sheet_name=sheet_index,
+        usecols=usecols,
+        names=column_names
+    )
+
+    # Cleaning steps, in order: names -> codes -> metadata -> units.
+    cleaned = clean_city_names(raw)
+    coded = map_city_codes(cleaned)
+    tagged = add_metadata(coded, year, month)
+    return convert_units(tagged)
+
+
+def handle_february_special_case(db, file_path, year, usecols):
+    """Derive January data from a February workbook; failures are only printed."""
+    try:
+        if year != 2023:
+            process_regular_february(db, file_path, year)
+        else:
+            # 2023 workbooks need the two-sheet routine.
+            process_2023_february(db, file_path, usecols)
+    except Exception as e:
+        # Best effort: a failed January reconstruction must not abort the run.
+        print(f"生成模拟1月数据失败: {str(e)}")
+
+
+# ---------- 工具函数 ----------
+def clean_city_names(df):
+    """Strip parenthesised notes and surrounding whitespace from ``city_name`` (in place)."""
+    names = df['city_name']
+    without_notes = names.str.replace(r'[(].*?[)]', '', regex=True)
+    df['city_name'] = without_notes.str.strip()
+    return df
+
+
+def map_city_codes(df):
+    """Add ``city_code`` from FUJIAN_CITY and keep only rows with a known city."""
+    codes = df['city_name'].map(FUJIAN_CITY)
+    df['city_code'] = codes
+    has_code = df['city_code'].notnull()
+    return df[has_code].copy()
+
+
+def add_metadata(df, year, month):
+    """Return a copy of *df* with the year, year-month and province columns added."""
+    period = f"{year}-{month:02d}"
+    extra_columns = {
+        'crossborder_year': year,
+        'crossborder_year_month': period,
+        'prov_code': PROV_CODE,
+        'prov_name': PROV_NAME,
+    }
+    return df.assign(**extra_columns)
+
+
+def convert_units(df):
+    """Apply ``convert_wan_to_yuan`` to the three monthly amount columns (in place)."""
+    amount_columns = ('monthly_total', 'monthly_import', 'monthly_export')
+    for name in amount_columns:
+        converted = df[name].apply(convert_wan_to_yuan)
+        df[name] = converted
+    return df
+
+
+def bulk_insert_data(db, df, conflict_cols, update_cols):
+    """Upsert *df* into the province/city trade table through ``db.bulk_insert``."""
+    target_table = 't_yujin_crossborder_prov_region_trade'
+    db.bulk_insert(
+        df,
+        target_table,
+        conflict_columns=conflict_cols,
+        update_columns=update_cols
+    )
+
+
+# ---------- 二月特殊处理逻辑 ----------
+def process_2023_february(db, file_path, usecols):
+    """Rebuild January 2023 from the two sheets of the February 2023 workbook."""
+    # Sheet 0 is loaded as the cumulative figures, sheet 1 as the single month.
+    cumulative = load_sheet_data(file_path, sheet_index=0, usecols=usecols)
+    february = load_sheet_data(file_path, sheet_index=1, usecols=usecols)
+
+    # Pair both views per city so the difference can be taken column by column.
+    paired = cumulative.merge(
+        february,
+        on='city_code',
+        suffixes=('_ytd', '_current')
+    )
+
+    january_df = create_january_data(paired, year=2023)
+
+    key_columns = ['crossborder_year_month', 'city_code']
+    value_columns = [
+        'monthly_total', 'monthly_import', 'monthly_export',
+        'yoy_import_export', 'yoy_import', 'yoy_export'
+    ]
+    bulk_insert_data(
+        db, january_df,
+        conflict_cols=key_columns,
+        update_cols=value_columns
+    )
+
+
+def process_regular_february(db, file_path, year):
+    """Rebuild January of *year* from a single-sheet February workbook."""
+    column_names = [
+        'city_name', 'ytd_monthly_total', 'monthly_total', 'yoy_import_export',
+        'ytd_monthly_export', 'monthly_export', 'yoy_export',
+        'ytd_monthly_import', 'monthly_import', 'yoy_import'
+    ]
+    raw = pd.read_excel(
+        file_path,
+        header=4,
+        usecols=[0, 1, 3, 4, 5, 7, 8, 9, 11, 12],
+        names=column_names
+    )
+
+    # Same cleaning chain as a normal month, then cumulative minus February.
+    cleaned = clean_city_names(raw)
+    coded = map_city_codes(cleaned)
+    in_yuan = convert_special_units(coded)
+    january = calculate_january_values(in_yuan)
+    processed_df = add_metadata(january, year=year, month=1)
+
+    key_columns = ['crossborder_year_month', 'city_code']
+    value_columns = [
+        'monthly_total', 'monthly_import', 'monthly_export',
+        'yoy_import_export', 'yoy_import', 'yoy_export'
+    ]
+    bulk_insert_data(
+        db, processed_df,
+        conflict_cols=key_columns,
+        update_cols=value_columns
+    )
+
+
+def load_sheet_data(file_path, sheet_index, usecols):
+    """Read one sheet and clean it (names, codes, units) without adding metadata."""
+    column_names = [
+        'city_name', 'monthly_total', 'yoy_import_export',
+        'monthly_export', 'yoy_export', 'monthly_import', 'yoy_import'
+    ]
+    raw = pd.read_excel(
+        file_path,
+        sheet_name=sheet_index,
+        header=4,
+        usecols=usecols,
+        names=column_names
+    )
+    cleaned = clean_city_names(raw)
+    coded = map_city_codes(cleaned)
+    return convert_units(coded)
+
+def create_january_data(merged_df, year):
+    """Build January rows as cumulative minus current month, with fixed output columns."""
+    # Step 1: differences between the "_ytd" and "_current" views.
+    jan_total = merged_df['monthly_total_ytd'] - merged_df['monthly_total_current']
+    jan_export = merged_df['monthly_export_ytd'] - merged_df['monthly_export_current']
+    jan_import = merged_df['monthly_import_ytd'] - merged_df['monthly_import_current']
+    with_values = merged_df.assign(
+        monthly_total=jan_total,
+        monthly_export=jan_export,
+        monthly_import=jan_import,
+        yoy_import_export=0.0,
+        yoy_export=0.0,
+        yoy_import=0.0,
+        crossborder_year_month=f"{year}-01",
+        city_name=merged_df['city_name_current']
+    )
+
+    # Step 2: keep exactly the output columns, dropping the suffixed ones.
+    output_columns = [
+        'city_code', 'city_name',
+        'monthly_total', 'monthly_export', 'monthly_import',
+        'yoy_import_export', 'yoy_export', 'yoy_import',
+        'crossborder_year_month'
+    ]
+    trimmed = with_values.reindex(columns=output_columns)
+
+    # Step 3: append the year and province metadata.
+    return trimmed.assign(
+        crossborder_year=year,
+        prov_code=PROV_CODE,
+        prov_name=PROV_NAME
+    )
+
+
+def convert_special_units(df):
+    """Apply ``convert_wan_to_yuan`` to the cumulative and monthly amount columns (in place)."""
+    amount_columns = (
+        'ytd_monthly_total', 'monthly_total',
+        'ytd_monthly_export', 'monthly_export',
+        'ytd_monthly_import', 'monthly_import',
+    )
+    for name in amount_columns:
+        converted = df[name].apply(convert_wan_to_yuan)
+        df[name] = converted
+    return df
+
+
+def calculate_january_values(df):
+    """Replace the monthly columns with cumulative minus monthly and zero the YoY columns."""
+    jan_total = df['ytd_monthly_total'] - df['monthly_total']
+    jan_export = df['ytd_monthly_export'] - df['monthly_export']
+    jan_import = df['ytd_monthly_import'] - df['monthly_import']
+    january = df.assign(
+        monthly_total=jan_total,
+        monthly_export=jan_export,
+        monthly_import=jan_import,
+        yoy_import_export=0.0,
+        yoy_export=0.0,
+        yoy_import=0.0
+    )
+    # The cumulative columns are no longer needed once the difference is taken.
+    cumulative_columns = [
+        'ytd_monthly_total', 'ytd_monthly_export',
+        'ytd_monthly_import'
+    ]
+    return january.drop(columns=cumulative_columns)
+
+
+# def clean_commodity_name(name):
+#     return re.sub(r'[^\w\u4e00-\u9fa5]', '', str(name)).strip()
+
+
+
+# Script entry: process the Fujian download tree with parse_excel, then
+# refresh the January YoY figures (update_january_yoy defaults to '福建省').
+if __name__ == "__main__":
+    traverse_and_process(download_dir, parse_excel, province_name="fujian")
+    print("更新同比数据……")
+    db_helper = DBHelper()
+    db_helper.update_january_yoy()
+    # Manual single-directory run, kept for debugging:
+    # parse_excel(download_dir/"2023"/"02")

+ 200 - 0
fujian/selenium_fujian_download.py

@@ -0,0 +1,200 @@
+import argparse
+import random
+import time
+from datetime import datetime, timedelta
+
+from selenium import webdriver
+from selenium.common import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from fujian.fujian_parse_excel import parse_excel
+from utils.constants import DOWNLOAD_DIR
+from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
+from utils.parse_utils import traverse_and_process
+
+# Basic configuration
+
+# Maximum download attempts per file in process_month_data().
+MAX_RETRY = 3
+# Listing page opened by detect_latest_month().
+BASE_URL = "http://fuzhou.customs.gov.cn/fuzhou_customs/zfxxgk19/2963574/2963954/484131/index.html"
+# Download target folder, also the tree parsed after crawling.
+download_dir = DOWNLOAD_DIR / "fujian"
+
+
+
+def detect_latest_month(driver):
+    """Find the newest month that has a by-city table on the listing page.
+
+    Opens ``BASE_URL`` and checks the current calendar month and the two
+    months before it, newest first, waiting up to 10 seconds for a link whose
+    title contains that month's table name.
+
+    Args:
+        driver: Selenium WebDriver used to load and query the page.
+
+    Returns:
+        tuple: ``(year, month)`` of the first month whose link is present.
+
+    Raises:
+        Exception: none of the three months has a matching link.
+    """
+    driver.get(BASE_URL)
+    today = datetime.now()
+    # Count months on one axis so stepping back is exact calendar arithmetic.
+    # Subtracting 30-day blocks can test the same month twice and skip another
+    # (from 31 March: 1 March, then 30 January -- February is never checked).
+    newest_index = today.year * 12 + today.month - 1
+    for offset in range(3):
+        check_year, zero_based_month = divmod(newest_index - offset, 12)
+        check_month = zero_based_month + 1
+
+        target_title = f"{check_year}年{check_month}月和1-{check_month}月福建省外贸进出口情况表(分地市)"
+        try:
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]'))
+            )
+        except TimeoutException:
+            # Only "link did not appear" means "try an older month"; other
+            # errors (and Ctrl-C) now propagate instead of being swallowed.
+            print(f"未找到 {target_title}")
+            continue
+        print(f"已找到最新月份数据 {check_year}-{check_month}")
+        return check_year, check_month
+    raise Exception("三个月内未找到有效数据")
+
+
+
+def process_month_data(driver, year, month):
+    """
+    Download the by-city table of one month from the page currently loaded.
+
+    Each link whose title equals the expected table name is downloaded with up
+    to ``MAX_RETRY`` attempts. Returns the number of tables downloaded, or the
+    sentinel 1000 as soon as one file exhausts its attempts.
+    """
+    required_title = f"{year}年{month}月和1-{month}月福建省外贸进出口情况表(分地市)"
+    found_count = 0
+
+    # Candidate links: every anchor whose title mentions the province.
+    candidates = driver.find_elements(By.XPATH, '//a[contains(@title,"福建省")]')
+
+    for link in candidates:
+        title = link.get_attribute("title")
+        if title != required_title:
+            continue
+
+        url = link.get_attribute("href")
+        for attempt in range(1, MAX_RETRY + 1):
+            try:
+                download_excel(driver, url, year, month, title, download_dir)
+                found_count += 1
+                time.sleep(random.uniform(0.5, 1.5))  # short pause after a success
+                break
+            except Exception as e:
+                print(f"下载 {title} 失败(第{attempt}次重试): {str(e)}")
+                if attempt >= MAX_RETRY:
+                    print(f"❌ 超出最大重试次数,跳过该文件:{title}")
+                    return 1000
+                print(f"🔄 第{attempt}次重试:{title}")
+                time.sleep(random.uniform(2, 4))  # random back-off before retrying
+
+    print(f"本页找到{found_count}个有效表格")
+    return found_count
+
+
+def reverse_crawler(driver, target_months):
+    """Walk the paginated listing and download the table of each target month.
+
+    For every (year, month) pair the current page is scanned with
+    process_month_data(); if nothing is found the "下一页" (next page) link is
+    clicked and the scan repeats. The browser is never sent back to the first
+    page between months, so this presumably relies on target_months running
+    newest to oldest -- verify against generate_month_sequence.
+
+    Returns the set of (year, month) pairs whose table was downloaded.
+    """
+    processed_months = set()
+    # Example input: target_months = [(2023, 5), (2023, 4)]
+    # Page counter for log output only; it keeps counting across months.
+    page = 1
+    for year, month in target_months:
+        print(f"\n开始处理 {year}年{month}月数据".center(50, "="))
+
+        # Wait until the listing container is present.
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+        )
+
+        # current_page is written below but never read.
+        current_page = 1
+        found_tables = 0
+
+        while True:
+            # Randomised pause to let the page settle.
+            random_sleep(base=2, variance=3)
+
+            try:
+                # Log where we are.
+                print(f"当前页面:{driver.current_url}, 第{page}页")
+
+                # Scan the current page for this month's table.
+                found = process_month_data(driver, year, month)
+                found_tables += found
+
+                # 1000 is the failure sentinel returned by process_month_data.
+                if found_tables == 1000:
+                    print(f"❌{year}年{month}月数据采集失败,跳过当前月")
+                    break
+
+                # One table per month is expected; stop once it is found.
+                if found_tables >= 1:
+                    print(f"已完成{year}年{month}月全部表格采集")
+                    processed_months.add((year, month))
+                    break
+                print(f"第{page}页已采集表格数:{found_tables}/1,前往下一页采集")
+                # Go to the next page; the wait raises TimeoutException when
+                # no clickable next-page link shows up.
+                WebDriverWait(driver, 15).until(
+                    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
+                ).click()
+
+                current_page += 1
+                page += 1
+
+
+            except TimeoutException:
+                # No further page: give up on this month.
+                print(f"未找到更多分页,已采集表格数:{found_tables}/1")
+                break
+            except Exception as e:
+                print(f"分页异常:{str(e)}")
+                handle_retry(driver)  # refresh the page, then give up on this month
+                break
+
+    return processed_months
+
+
+
+def random_sleep(base=2, variance=5):
+    """Sleep for *base* seconds plus a random extra in [0, variance)."""
+    jitter = random.random() * variance
+    time.sleep(base + jitter)
+
+def handle_retry(driver):
+    """Recover from a paging error by refreshing and waiting for the listing.
+
+    Re-raises whatever went wrong if the refresh or the 15-second wait fails.
+    """
+    try:
+        driver.refresh()
+        # The listing container being present is taken as "page is usable".
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+        )
+        print("浏览器异常已恢复")
+    except:
+        # Bare except is tolerable here only because the error is re-raised.
+        print("需要人工干预的严重错误")
+        raise
+
+
+def main():
+    """Command-line entry: crawl the Fujian tables, then parse what was downloaded.
+
+    ``--year`` selects how far back to crawl; without it the default sequence
+    of generate_month_sequence() is used (presumably the latest two months --
+    verify in utils.download_utils).
+    """
+    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+    parser.add_argument('--year', type=int, default=None,
+                        help='终止年份(如2023),未指定时抓取最新两个月')
+    args = parser.parse_args()
+    driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
+    try:
+        # Detect the newest month available on the site.
+        valid_year, valid_month = detect_latest_month(driver)
+        print(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
+
+        # Build the list of months to fetch.
+        if args.year:
+            # Year given: from the newest month back to that year, with
+            # skip_january=True passed to the generator.
+            target_months = generate_month_sequence(
+                start_year=valid_year,
+                start_month=valid_month,
+                end_year=args.year,
+                skip_january=True
+
+            )
+        else:
+            # No year given: use the generator's default range.
+            target_months = generate_month_sequence(valid_year, valid_month)
+
+        print(f"目标采集月份序列:{target_months}")
+        reverse_crawler(driver, target_months)
+        print(f"{len(target_months)}个月份数据已采集完毕")
+
+    finally:
+        # driver is assigned before the try block, so this check is always true.
+        if 'driver' in locals():
+            driver.quit()
+        # Parsing runs even when crawling failed, on whatever was downloaded.
+        print("\n数据清洗入库中...")
+        traverse_and_process(download_dir, parse_excel, province_name="fujian")
+
+
+# Run the crawler only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()

+ 0 - 0
guangdong/__init__.py


+ 11 - 0
guangdong/cross.log

@@ -0,0 +1,11 @@
+2025-05-28 17:43:25 - utils.log:267 - INFO - 【广州海关】数据采集开始……
+2025-05-28 17:43:26 - utils.log:83 - INFO - 【广州海关】最新月份数据 2025-4:2025年1-4月广州关区所辖7地市进出口综合统计资料
+2025-05-28 17:43:26 - utils.log:269 - INFO - 【广州海关】检测到最新有效数据:2025-04
+2025-05-28 17:43:26 - utils.log:281 - INFO - 【广州海关】目标采集月份序列:[(2025, 4), (2025, 3), (2025, 2), (2024, 12), (2024, 11), (2024, 10), (2024, 9), (2024, 8), (2024, 7), (2024, 6), (2024, 5), (2024, 4), (2024, 3), (2024, 2), (2023, 12), (2023, 11), (2023, 10), (2023, 9), (2023, 8), (2023, 7), (2023, 6), (2023, 5), (2023, 4), (2023, 3), (2023, 2)]
+2025-05-28 17:43:26 - utils.log:133 - INFO - ===============
+开始处理广州海关 2025年4月数据================
+2025-05-28 17:43:30 - utils.log:150 - INFO - 【广州海关】当前页面:http://guangzhou.customs.gov.cn/guangzhou_customs/381558/fdzdgknr33/381638/381572/381573/index.html, 第1页
+2025-05-28 17:43:36 - utils.log:123 - INFO - 本页找到1个有效表格
+2025-05-28 17:43:36 - utils.log:157 - INFO - 【广州海关】已完成2025年4月全部表格采集
+2025-05-28 17:43:36 - utils.log:133 - INFO - ===============
+开始处理广州海关 2025年3月数据================

+ 291 - 0
guangdong/selenium_guangdong_city.py

@@ -0,0 +1,291 @@
+import argparse
+import random
+import re
+import time
+from datetime import datetime, timedelta
+
+from selenium import webdriver
+from selenium.common import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from utils.constants import DOWNLOAD_DIR
+from utils.constants import GUANGDONG_CUSTOMS_URL
+from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel, download_excel2, \
+    batch_download_excel
+from utils.log import log
+
+download_dir = DOWNLOAD_DIR / "guangdong"
+
+
+
def generate_target_title(check_year, check_month, customs_name):
    """Build the regex source that matches one customs office's report title.

    Every Guangdong customs office titles its statistics pages differently, so
    the pattern is selected by ``customs_name``. Offices without a dedicated
    rule fall back to a generic "<office>进出口综合统计资料" pattern.

    Args:
        check_year: Year interpolated into the pattern.
        check_month: Month interpolated into the pattern (not zero padded).
        customs_name: Customs office name, e.g. "广州海关".

    Returns:
        A regular-expression source string meant to be used with ``re.search``.
    """
    if customs_name == "广州海关":
        return rf'{check_year}\s*年\s*(?:1[--]\s*)?{check_month}月广州关区所辖7地市进出口综合统计资料'
    if customs_name == "深圳海关":
        return rf"{check_year}\s*年\s*(?:1[--]\s*)?{check_month}月(深圳海关|深圳关区)综合统计资料"
    if customs_name == "拱北海关":
        return rf"\S+市{check_year}\s*年\s*(?:1[--]\s*)?{check_month}月对外贸易进出口统计表"
    if customs_name == "汕头海关":
        return rf"5市报表{check_year}年(?:1[--]\s*{check_month}月|{check_month}月)(人民币)"
    if customs_name == "黄埔海关":
        return rf"{check_year}年\s*(?:1[--]\s*)?{check_month}月东莞市进出口企业性质总值表"
    if customs_name == "江门海关":
        if check_month == 3:
            # March may be published as "一季度" (first quarter).
            return rf"{check_year}年\s*(?:一季度|前{check_month}个月|\s*{check_month}月)[\u4e00-\u9fa5]+市外贸进出口有关情况统计表(以人民币计价)"
        if check_month == 12:
            # The December title may omit the month entirely.
            return rf"{check_year}年\s*(?:{check_month}月\s*)?.*外贸进出口有关情况统计表(以人民币计价)"
        return rf"{check_year}年\s*前?{check_month}个?月.*外贸进出口有关情况统计表(以人民币计价)"
    if customs_name == "湛江海关":
        if check_month == 3:
            return rf"{check_year}年\s*(?:一季度|前3个月|3月).*外贸进出口数据"
        if check_month == 9:
            return rf"{check_year}年\s*(?:前三季度|前9个月|9月).*外贸进出口数据"
        if check_month == 12:
            # This pattern used to be a bare expression that was never
            # assigned, so December returned a stale value or raised NameError.
            return rf'^{check_year}年(?:及{check_month}月份)?(?:湛江市、茂名市).*外贸进出口数据'
        return rf"{check_year}年\s*前?{check_month}个?月.*(外贸)?进出口数据"
    return rf"{check_year}\s*年\s*(?:1[--]\s*)??{check_month}月{customs_name}进出口综合统计资料"
+
+
+
+
def detect_latest_month(driver, customs_name):
    """Find the most recent month that has a published report on the page.

    Checks the current calendar month and the two months before it, newest
    first, and returns the first one whose title pattern matches a link on the
    page currently loaded in ``driver``.

    Args:
        driver: Selenium WebDriver already showing the office's list page.
        customs_name: Customs office name used to pick the title pattern.

    Returns:
        Tuple ``(year, month)`` of the newest month that has a matching link.

    Raises:
        Exception: If none of the three candidate months has a matching link.
    """
    today = datetime.now()
    for offset in range(3):
        # Step back by whole calendar months. Subtracting 30-day blocks can
        # skip a month (31 March -> 1 March -> 30 January) or repeat one.
        month_ordinal = today.year * 12 + (today.month - 1) - offset
        check_year, zero_based_month = divmod(month_ordinal, 12)
        check_month = zero_based_month + 1

        # Title pattern specific to this customs office.
        target_title = generate_target_title(check_year, check_month, customs_name)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
            )

            # Collect every anchor carrying a title and match it in Python.
            links = driver.find_elements(By.XPATH, '//a[@title]')

            for link in links:
                title = link.get_attribute('title')
                if not title:
                    # A missing title used to raise inside re.search and abort
                    # the scan of the remaining links for this month.
                    continue
                if re.search(target_title, title, re.IGNORECASE):
                    log.info(f"【{customs_name}】最新月份数据 {check_year}-{check_month}:{title}")
                    return check_year, check_month

        except Exception as e:
            log.info(f"未找到 {target_title}: {e}")
            continue
    raise Exception("三个月内未找到有效数据")
+
def process_month_data(driver, year, month, customs_name, max_retries=3):
    """Download every report on the current page that matches the given month.

    Args:
        driver: Selenium WebDriver showing a list page.
        year: Target year.
        month: Target month.
        customs_name: Customs office name; selects the title pattern and the
            download helper.
        max_retries: Maximum download attempts per matching link.

    Returns:
        Number of links on this page that were downloaded successfully.
    """
    title_pattern = generate_target_title(year, month, customs_name)
    downloaded = 0

    for anchor in driver.find_elements(By.XPATH, '//a[@title]'):
        try:
            title = anchor.get_attribute('title')
            if not re.search(title_pattern, title, re.IGNORECASE):
                continue

            url = anchor.get_attribute("href")

            for attempt in range(max_retries):
                try:
                    # Each office needs a different download routine.
                    if customs_name in ['汕头海关', '江门海关']:
                        download_excel2(driver, anchor, year, month, title, download_dir)
                    elif customs_name == "湛江海关":
                        batch_download_excel(driver, url, year, month, title, download_dir)
                    else:
                        download_excel(driver, url, year, month, title, download_dir)
                    downloaded += 1
                    # Short random pause between downloads.
                    time.sleep(random.uniform(0.5, 1.5))
                    break
                except Exception as e:
                    log.info(f"【{customs_name}】第 {attempt + 1} 次重试失败: {str(e)}")
                    if attempt + 1 == max_retries:
                        log.info(f"【{customs_name}】已达最大重试次数,放弃采集: {title}")
        except Exception as e:
            log.info(f"无法获取 title 属性: {e}")

    log.info(f"本页找到{downloaded}个有效表格")
    return downloaded
+
+
def reverse_crawler(driver, target_months,  customs_name):
    """Walk the paginated list, collecting reports for each target month.

    Months are handled in the order given. The browser is not sent back to the
    first page between months, and the page counter keeps counting across
    months.

    Args:
        driver: Selenium WebDriver showing the office's list page.
        target_months: Iterable of ``(year, month)`` tuples to collect.
        customs_name: Customs office name.

    Returns:
        Set of ``(year, month)`` tuples for which the expected number of
        tables was collected.
    """
    completed = set()
    page = 1
    # 拱北 and 江门 are expected to yield two tables per month, the rest one.
    expected_tables = 2 if customs_name in ("拱北海关", "江门海关") else 1

    for year, month in target_months:
        log.info(f"\n开始处理{customs_name} {year}年{month}月数据".center(50, "="))

        list_ready = EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
        WebDriverWait(driver, 15).until(list_ready)

        collected = 0

        while True:
            # Random pause so the page can settle before it is read.
            random_sleep(base=2, variance=3)

            try:
                log.info(f"【{customs_name}】当前页面:{driver.current_url}, 第{page}页")
                collected += process_month_data(driver, year, month ,customs_name)

                if collected >= expected_tables:
                    log.info(f"【{customs_name}】已完成{year}年{month}月全部表格采集")
                    completed.add((year, month))
                    break

                log.info(f"【{customs_name}】第{page}页已采集表格数:{collected}/{expected_tables},前往下一页采集")
                # Move on to the next page of the list.
                next_link = WebDriverWait(driver, 15).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
                )
                next_link.click()

                page += 1

            except TimeoutException:
                log.info(f"未找到更多分页,已采集表格数:{collected}/{expected_tables}")
                break
            except Exception as e:
                log.info(f"分页异常:{str(e)}")
                handle_retry(driver)  # try to recover the browser session
                break

    return completed
+
def handle_retry(driver):
    """Try to recover the browser by refreshing and waiting for the list.

    Args:
        driver: Selenium WebDriver to refresh.

    Raises:
        Exception: Re-raises whatever the refresh or the wait raised.
    """
    try:
        driver.refresh()
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
        )
        log.info("浏览器异常已恢复")
    except Exception:
        # Narrowed from a bare ``except`` so Ctrl-C / SystemExit are not
        # logged as a browser failure.
        log.info("需要人工干预的严重错误")
        raise
+
+
def random_sleep(base=2, variance=5):
    """Pause for ``base`` seconds plus a random extra of up to ``variance``.

    Args:
        base: Minimum number of seconds to sleep.
        variance: Upper bound (exclusive) of the random extra delay in seconds.
    """
    jitter = random.random() * variance
    time.sleep(base + jitter)
+
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+# def process_customs(customs_name, args):
+#     """处理单个海关的数据抓取任务"""
+#     options = configure_stealth_options(download_dir)
+#     driver = webdriver.Firefox(options=options)
+#
+#     try:
+#         driver.get(GUANGDONG_CUSTOMS_URL[customs_name])
+#         valid_year, valid_month = detect_latest_month(driver, customs_name)
+#         log.info(f"检测到{customs_name}最新有效数据:{valid_year}-{valid_month:02d}")
+#
+#         if customs_name in ['汕头海关', '拱北海关', '江门海关']:
+#             skip_january = False
+#         else:
+#             skip_january = True
+#
+#         if args.year:
+#             target_months = generate_month_sequence(valid_year, valid_month, args.year, skip_january)
+#         else:
+#             target_months = generate_month_sequence(valid_year, valid_month)
+#
+#         log.info(f"目标采集月份序列:{target_months}")
+#         reverse_crawler(driver, target_months, customs_name)
+#         log.info(f"{customs_name} {len(target_months)}个月份数据已采集完毕")
+#         return customs_name, True
+#     except Exception as e:
+#         log.info(f"[错误] 采集失败:{customs_name} - {str(e)}")
+#         return customs_name, False
+#     finally:
+#         driver.quit()
+#
+#
+# def main():
+#     parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+#     parser.add_argument('--year', type=int, default=None,
+#                         help='终止年份(如2023),未指定时抓取最新两个月')
+#     args = parser.parse_args()
+#
+#     customs_list = GUANGDONG_CUSTOMS_URL.keys()
+#
+#     # 使用线程池并发采集
+#     with ThreadPoolExecutor(max_workers=3) as executor:
+#         futures = []
+#         for customs_name in customs_list:
+#             future = executor.submit(process_customs, customs_name, args)
+#             futures.append(future)
+#
+#         for future in as_completed(futures):
+#             customs_name, success = future.result()
+#             if success:
+#                 log.info(f"[完成] {customs_name} 数据采集成功")
+#             else:
+#                 log.info(f"[失败] {customs_name} 数据采集失败")
+#
+#     log.info("\n广东省所有海关数据采集完成。")
+
def main():
    """Command-line entry point: crawl every Guangdong customs office in turn.

    ``--year`` sets the oldest year to crawl back to; when it is omitted,
    ``generate_month_sequence`` is called with its defaults.

    A failure at one office is logged and the crawl moves on to the next one.
    The browser is always closed, including when an error escapes.
    """
    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
    parser.add_argument('--year', type=int, default=None,
                        help='终止年份(如2023),未指定时抓取最新两个月')
    args = parser.parse_args()
    driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
    try:
        for customs_name, customs_url in GUANGDONG_CUSTOMS_URL.items():
            try:
                driver.get(customs_url)
                log.info(f"【{customs_name}】数据采集开始……")
                valid_year, valid_month = detect_latest_month(driver, customs_name)
                log.info(f"【{customs_name}】检测到最新有效数据:{valid_year}-{valid_month:02d}")

                # Only 汕头 and 拱北 keep January in the generated sequence.
                skip_january = customs_name not in ['汕头海关', '拱北海关']

                if args.year:
                    target_months = generate_month_sequence(valid_year, valid_month, args.year, skip_january)
                else:
                    target_months = generate_month_sequence(valid_year, valid_month)

                log.info(f"【{customs_name}】目标采集月份序列:{target_months}")
                reverse_crawler(driver, target_months, customs_name)
                log.info(f"【{customs_name}】{len(target_months)}个月份数据已采集完毕".center(66, "="))
            except Exception as e:
                # The old ``try/finally: pass`` let one broken office abort the
                # whole run; log it and continue with the next office.
                log.info(f"【{customs_name}】采集失败:{e}")
    finally:
        # Close the browser on every exit path so no Firefox process leaks.
        driver.quit()

    log.info("\n所有数据清洗入库中...")
+
+if __name__ == "__main__":
+    main()

+ 440 - 0
guangdong/selenium_guangdong_download.py

@@ -0,0 +1,440 @@
+import argparse
+import argparse
+import random
+import re
+import time
+from datetime import datetime, timedelta
+
+import pandas as pd
+from selenium import webdriver
+from selenium.common import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from db_helper import DBHelper
+from utils.constants import DOWNLOAD_DIR, COUNTRY_CODE_MAPPING
+from utils.download_utils import configure_stealth_options, generate_month_sequence
+from utils.parse_utils import clean_county_name, convert_wan_to_yuan, clean_commodity_name
+
+# 基础配置
+
+
+BASE_URL = "http://gdfs.customs.gov.cn/guangdong_sub/zwgk62/sjgb59/6b4cdb3f-1.html"
+download_dir = DOWNLOAD_DIR / "guangdong"
+
+
+
+PROV_CODE = "440000"
+PROV_NAME = "广东省"
+
+db = DBHelper()
+
+
+
def detect_latest_month(driver):
    """Find the newest month whose country table is listed on BASE_URL.

    Loads ``BASE_URL`` and checks the current calendar month and the two
    months before it, newest first.

    Args:
        driver: Selenium WebDriver used to load and inspect the list page.

    Returns:
        Tuple ``(year, month)`` of the newest month that has a matching title.

    Raises:
        Exception: If none of the three candidate months has a matching title.
    """
    driver.get(BASE_URL)
    today = datetime.now()

    for offset in range(3):
        # Step back by whole calendar months. Subtracting 30-day blocks can
        # skip a month (31 March -> 1 March -> 30 January) or repeat one.
        month_ordinal = today.year * 12 + (today.month - 1) - offset
        check_year, zero_based_month = divmod(month_ordinal, 12)
        check_month = zero_based_month + 1

        # Accept "X月", "1至X月" and "1-X月". Detection used to accept only
        # "1至" while process_month_data accepts only "1-".
        pattern = re.compile(
            rf'(5){check_year}\s*年\s*(?:1\s*[-至]\s*)?{check_month}\s*月广东省外贸进出口主要国别(地区)总值表(人民币值)',
            re.IGNORECASE
        )

        try:
            # Match in Python against every link whose title mentions 广东省.
            elements = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, '//a[contains(@title,"广东省")]'))
            )

            for element in elements:
                title = element.get_attribute("title")
                if title and pattern.search(title):
                    print(f"已找到最新月份数据 {check_year}-{check_month}")
                    return check_year, check_month

            print(f"未找到匹配项(正则:{pattern.pattern})")
        except TimeoutException:
            print(f"页面加载超时或无匹配项({check_year}-{check_month})")
            continue

    raise Exception("三个月内未找到有效数据")
+
+
def process_month_data(driver, year, month):
    """Process the month's three report tables found on the current list page.

    The country table is parsed and written to the database immediately. The
    export and import commodity tables are parsed and returned so the caller
    can merge them once both are available.

    Args:
        driver: Selenium WebDriver showing a list page.
        year: Target year.
        month: Target month.

    Returns:
        Tuple ``(found_count, commodity_data)``. ``found_count`` is the number
        of tables processed successfully; ``commodity_data`` is a dict with the
        keys ``'export'`` and ``'import'`` holding the parsed rows.
    """
    # Accept "X月", "1-X月" and "1至X月" so this agrees with
    # detect_latest_month, which looks for the "1至" form.
    cumulative = r'(?:1\s*[-至]\s*)?'
    patterns = [
        (re.compile(rf'(5){year}\s*年\s*{cumulative}{month}\s*月广东省外贸进出口主要国别(地区)总值表(人民币值)'), 'country'),
        (re.compile(rf'(6){year}\s*年\s*{cumulative}{month}\s*月广东省出口重点商品总值表(人民币值)'), 'export_commodity'),
        (re.compile(rf'(7){year}\s*年\s*{cumulative}{month}\s*月广东省进口重点商品总值表(人民币值)'), 'import_commodity')
    ]

    found_count = 0
    commodity_data = {'export': [], 'import': []}  # rows waiting to be merged
    main_window = driver.current_window_handle
    links = driver.find_elements(By.XPATH, '//a[contains(@title,"广东省")]')

    for link in links:
        title = link.get_attribute("title")
        if not title:
            continue
        # A title can match at most one of the three table kinds.
        table_type = next((kind for pattern, kind in patterns if pattern.search(title)), None)
        if table_type is None:
            continue

        print(f"处理表格: {title}")
        url = link.get_attribute("href")

        # Open the detail page in a new tab so the list page stays loaded.
        driver.execute_script("window.open(arguments[0]);", url)
        driver.switch_to.window(driver.window_handles[-1])
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//table[@border='1']"))
            )

            if table_type == 'country':
                data = parse_country_table(driver, year, month)
                df_country = pd.DataFrame(data)
                db.bulk_insert(
                    df_country,
                    't_yujin_crossborder_prov_country_trade',
                    conflict_columns=['crossborder_year_month', 'prov_code', 'country_code'],
                    update_columns=['monthly_total', 'monthly_import', 'monthly_export',
                                    'yoy_import_export', 'yoy_import', 'yoy_export']
                )
            else:
                data_type = 'export' if table_type == 'export_commodity' else 'import'
                commodity_data[data_type] = parse_commodity_table(driver, data_type, year, month)
            found_count += 1
        except Exception as e:
            print(f"表格处理失败: {e}")
        finally:
            # The parse_* helpers close the tab themselves, but when the wait
            # above fails they never run. Close any leftover tab and return to
            # the list window so the next link is read from the right page.
            for handle in driver.window_handles:
                if handle != main_window:
                    driver.switch_to.window(handle)
                    driver.close()
            driver.switch_to.window(main_window)

    # The caller merges the commodity rows.
    return found_count, commodity_data
+
+
+
def parse_country_table(driver, year, month):
    """Parse the country/region trade table shown in the current tab.

    For ``month == 2`` the values are read from columns 1/3/5, halved, and
    emitted once for January and once for February with year-on-year set to 0.
    For every other month the values come from columns 3/7/11 and the
    year-on-year figures from columns 4/8/12.

    The current tab is always closed afterwards and the driver is switched
    back to the first window.

    Args:
        driver: Selenium WebDriver whose current tab shows the table page.
        year: Year of the data.
        month: Month of the data.

    Returns:
        List of dicts, one per country and month, ready for a DataFrame.

    Raises:
        Exception: Re-raises any error hit while locating or parsing the table.
    """
    # Aggregate rows that are not individual countries.
    aggregate_names = {
        '广东外贸总值', '东盟', '欧盟', '总计',
        '广东外贸总额', '广东外贸总计', '总值',
    }
    is_combined_jan_feb = month == 2
    if is_combined_jan_feb:
        total_idx, export_idx, import_idx = 1, 3, 5
        output_months = (1, 2)
    else:
        total_idx, export_idx, import_idx = 3, 7, 11
        output_months = (month,)
    # The highest index read is import_idx, or import_idx + 1 for its yoy.
    min_cols = import_idx + 1 if is_combined_jan_feb else import_idx + 2

    data = []

    try:
        # Wait for the table to be present.
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.XPATH, "//table[@border='1']"))
        )
        table = driver.find_element(By.XPATH, "//table[@border='1' and @bordercolor='#000000']")
        rows = table.find_elements(By.TAG_NAME, 'tr')

        # Data rows start at the fifth row; the rows before it are skipped.
        for row in rows[4:]:
            cols = [col.text.strip() for col in row.find_elements(By.TAG_NAME, 'td')]
            if len(cols) < min_cols:
                # Short rows used to raise IndexError and lose the whole table.
                if any(cols):
                    print(f"跳过列数不足的行: {cols}")
                continue

            country_name = cols[0]
            if country_name in aggregate_names:
                continue

            monthly_total = convert_wan_to_yuan(cols[total_idx])
            monthly_export = convert_wan_to_yuan(cols[export_idx])
            monthly_import = convert_wan_to_yuan(cols[import_idx])

            if is_combined_jan_feb:
                # Split the combined figure evenly over January and February;
                # no year-on-year value is available for the simulated months.
                monthly_total = monthly_total / 2
                monthly_export = monthly_export / 2
                monthly_import = monthly_import / 2
                yoy_total = 0
                yoy_export = 0
                yoy_import = 0
            else:
                yoy_total = parse_number(cols[total_idx + 1])
                yoy_export = parse_number(cols[export_idx + 1])
                yoy_import = parse_number(cols[import_idx + 1])

            country_name_clean = clean_county_name(country_name)
            country_code = COUNTRY_CODE_MAPPING.get(country_name_clean)

            for m in output_months:
                data.append({
                    'crossborder_year': year,
                    'crossborder_year_month': f"{year}-{m:02d}",
                    'prov_code': PROV_CODE,
                    'prov_name': PROV_NAME,
                    'country_code': country_code,
                    'country_name': country_name_clean,
                    'monthly_total': monthly_total,
                    'monthly_export': monthly_export,
                    'monthly_import': monthly_import,
                    'yoy_import_export': yoy_total,
                    'yoy_export': yoy_export,
                    'yoy_import': yoy_import
                })

    except Exception as e:
        print(f"解析表格失败: {e}")
        raise
    finally:
        # Close the detail tab and return to the list window.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

    return data
+
+
+
def parse_commodity_table(driver, data_type, year, month):
    """Parse an export or import commodity table shown in the current tab.

    For ``month == 2`` the value is read from column 1, halved, and emitted
    once for January and once for February. For every other month the value is
    read from column 3.

    The current tab is always closed afterwards and the driver is switched
    back to the first window.

    Args:
        driver: Selenium WebDriver whose current tab shows the table page.
        data_type: ``'export'`` stores the value under ``monthly_export``;
            anything else stores it under ``monthly_import``.
        year: Year of the data.
        month: Month of the data.

    Returns:
        List of dicts with ``commodity_name``, ``commodity_code``, the value
        column, and ``crossborder_year_month``.

    Raises:
        Exception: Re-raises any error hit while locating or parsing the table.
    """
    value_key = 'monthly_export' if data_type == 'export' else 'monthly_import'
    is_combined_jan_feb = month == 2
    value_idx = 1 if is_combined_jan_feb else 3
    output_months = (1, 2) if is_combined_jan_feb else (month,)
    # Keep the original three-column minimum and also cover the index that is
    # actually read; the old guard let cols[3] raise IndexError.
    min_cols = max(3, value_idx + 1)

    data = []
    try:
        table = driver.find_element(By.XPATH, "//table[@border='1' and @bordercolor='#000000']")
        rows = table.find_elements(By.TAG_NAME, 'tr')

        # Data rows start at the fifth row; the rows before it are skipped.
        for row in rows[4:]:
            cols = [col.text.strip() for col in row.find_elements(By.TAG_NAME, 'td')]
            if len(cols) < min_cols:
                continue

            name = clean_commodity_name(cols[0])
            value = convert_wan_to_yuan(cols[value_idx])
            if is_combined_jan_feb:
                # Split the combined figure evenly over January and February.
                value = value / 2

            # Look the code up once per row rather than once per output month.
            commodity_code = db.get_commodity_id(name)

            for m in output_months:
                data.append({
                    'commodity_name': name,
                    'commodity_code': commodity_code,
                    value_key: value,
                    'crossborder_year_month': f"{year}-{m:02d}"
                })

    except Exception as e:
        print(f"解析商品表失败: {e}")
        raise
    finally:
        # Close the detail tab and return to the list window.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

    return data
+
+
def merge_commodity_data(import_data, export_data, year, month):
    """Merge import and export commodity rows into one frame per commodity/month.

    The two inputs are outer-joined on commodity code, commodity name and
    ``crossborder_year_month``, so a commodity present on only one side is
    kept with 0 on the other side.

    Args:
        import_data: List of row dicts carrying ``monthly_import``.
        export_data: List of row dicts carrying ``monthly_export``.
        year: Year written to ``crossborder_year``.
        month: Month being processed. Kept for interface compatibility; each
            row already carries its own ``crossborder_year_month``.

    Returns:
        Merged DataFrame with ``monthly_total``, ``crossborder_year``,
        ``prov_code`` and ``prov_name`` added.
    """
    df_import = pd.DataFrame(import_data)
    df_export = pd.DataFrame(export_data)

    # Full outer join keeps commodities that appear on only one side.
    merged_df = pd.merge(
        df_import,
        df_export,
        on=['commodity_code',  'commodity_name',  'crossborder_year_month'],
        how='outer'
    ).fillna(0)

    merged_df['monthly_total'] = merged_df['monthly_import'] + merged_df['monthly_export']
    merged_df['crossborder_year'] = year
    # crossborder_year_month is deliberately left untouched. Overwriting it
    # with f"{year}-{month:02d}" turned the simulated January rows into
    # duplicate February rows.
    merged_df['prov_code'] = PROV_CODE
    merged_df['prov_name'] = PROV_NAME

    return merged_df
+
def parse_number(text):
    """Convert a table cell to ``float``, returning ``None`` when not numeric.

    Thousands separators (``,``) and surrounding whitespace are removed first.
    ``None``, an empty cell, a lone ``-`` and any unparsable text all give
    ``None``.

    Args:
        text: Cell content. Usually a string; ``None`` and numbers are accepted.

    Returns:
        The parsed ``float``, or ``None`` if the cell holds no number.
    """
    if text is None:
        # Used to raise AttributeError on ``None.strip()``.
        return None
    cleaned = str(text).strip().replace(',', '')
    if not cleaned or cleaned == '-':
        return None
    try:
        return float(cleaned)
    except ValueError:
        return None
+
+
+# 优化后的代码逻辑:
+
def _store_commodity_data(export_data, import_data, year, month):
    """Merge and upsert the month's commodity rows if both directions exist."""
    if not (export_data and import_data):
        return
    # Keyword arguments keep import/export in the order the merge declares.
    final_df = merge_commodity_data(
        import_data=import_data, export_data=export_data, year=year, month=month
    )
    db.bulk_insert(df=final_df, table_name='t_yujin_crossborder_prov_commodity_trade',
                   conflict_columns=['commodity_code', 'crossborder_year_month'],
                   update_columns=['monthly_import', 'monthly_export', 'monthly_total'])


def reverse_crawler(driver, target_months):
    """Walk the paginated list, collecting the three tables for each month.

    Months are handled in the order given. The browser is not sent back to the
    first page between months, and the page counter keeps counting across
    months. Commodity rows are written once both the export and the import
    table for a month have been parsed.

    Args:
        driver: Selenium WebDriver showing the list page.
        target_months: Iterable of ``(year, month)`` tuples to collect.

    Returns:
        Set of ``(year, month)`` tuples for which all three tables were found.
    """
    processed_months = set()
    page = 1
    for year, month in target_months:
        print(f"\n开始处理 {year}年{month}月数据".center(50, "="))
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, "conList_ul")))

        found_tables = 0
        export_data = []
        import_data = []

        while True:
            random_sleep(base=2, variance=3)
            try:
                print(f"当前页面:{driver.current_url}, 第{page}页")
                found, commodity_data = process_month_data(driver, year, month)
                found_tables += found

                # Accumulate commodity rows across pages.
                export_data.extend(commodity_data['export'])
                import_data.extend(commodity_data['import'])

                # All three tables collected.
                if found_tables >= 3:
                    _store_commodity_data(export_data, import_data, year, month)
                    print(f"已完成{year}年{month}月全部表格采集")
                    processed_months.add((year, month))
                    break

                print(f"第{page}页已采集表格数:{found_tables}/3,前往下一页采集")

                # Move on to the next page of the list.
                WebDriverWait(driver, 15).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@class="pagingNormal next"]'))
                ).click()

                page += 1

            except TimeoutException:
                print(f"未找到更多分页,已采集表格数:{found_tables}/3")
                # Pagination ended before all three tables were found. Still
                # write the commodity rows when both directions are complete
                # instead of silently dropping them.
                try:
                    _store_commodity_data(export_data, import_data, year, month)
                except Exception as e:
                    print(f"商品数据入库失败:{e}")
                break
            except Exception as e:
                print(f"分页异常:{str(e)}")
                handle_retry(driver)
                break

    return processed_months
+
+
+
def random_sleep(base=2, variance=5):
    """Sleep between ``base`` and ``base + variance`` seconds, chosen at random.

    Args:
        base: Fixed part of the delay in seconds.
        variance: Scale of the random part; the extra delay is
            ``random.random() * variance``.
    """
    time.sleep(base + random.random() * variance)
+
def handle_retry(driver):
    """Try to recover the browser by refreshing and waiting for the list.

    Args:
        driver: Selenium WebDriver to refresh.

    Raises:
        Exception: Re-raises whatever the refresh or the wait raised.
    """
    try:
        driver.refresh()
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
        )
        print("浏览器异常已恢复")
    except Exception:
        # Narrowed from a bare ``except`` so Ctrl-C / SystemExit are not
        # reported as a browser failure.
        print("需要人工干预的严重错误")
        raise
+
+
def main():
    """Command-line entry point for the Guangdong province crawler.

    ``--year`` sets the oldest year to crawl back to; when it is omitted,
    ``generate_month_sequence`` is called with its defaults. The browser is
    always closed when the run ends.
    """
    cli = argparse.ArgumentParser(description='海关数据智能抓取系统')
    cli.add_argument('--year', type=int, default=None,
                     help='终止年份(如2023),未指定时抓取最新两个月')
    options = cli.parse_args()
    browser = webdriver.Firefox(options=configure_stealth_options(download_dir))
    try:
        # Find the newest month that has published data.
        latest_year, latest_month = detect_latest_month(browser)
        print(f"检测到最新有效数据:{latest_year}年{latest_month:02d}月")

        if not options.year:
            # No end year given: use the helper's default range.
            months = generate_month_sequence(latest_year, latest_month)
        else:
            # End year given: go back from the newest month to that year.
            months = generate_month_sequence(
                start_year=latest_year,
                start_month=latest_month,
                end_year=options.year,
                skip_january=True
            )

        print(f"目标采集月份序列:{months}")
        reverse_crawler(browser, months)
        print(f"{len(months)}个月份数据已采集完毕")

    finally:
        browser.quit()
        print("\n数据清洗入库中...")
+
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
henan/__init__.py


+ 265 - 0
henan/henan_parse_excel.py

@@ -0,0 +1,265 @@
+import re
+from pathlib import Path
+
+import pandas as pd
+
+from db_helper import DBHelper
+from utils.constants import COUNTRY_CODE_MAPPING, EXCLUDE_REGIONS, DOWNLOAD_DIR
+from utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, find_unmatched_countries, \
+    extract_year_month_from_path, traverse_and_process
+
+# 常量配置(新增路径正则校验)
+PROV_CODE = "410000"
+PROV_NAME = "河南省"
+YEAR_PATTERN = re.compile(r"^\d{4}$")
+MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")
+download_dir = DOWNLOAD_DIR / "henan"
+
+
def parse_excel(current_dir):
    """Parse one month's downloaded Henan files and load them into the database.

    Args:
        current_dir (str): Directory holding the month's files. Year and month
            are derived from this path by ``extract_year_month_from_path``.

    Raises:
        FileNotFoundError: If a required workbook is missing from the directory.
        Exception: Any processing error is printed and re-raised.
    """
    current_path = Path(current_dir)
    year, month = extract_year_month_from_path(current_path)

    try:
        # Commodity trade (import + export workbooks).
        process_combined_trade(current_path, year, month)

        # Country trade workbook.
        country_file = next(current_path.glob("*国别*"), None)
        if country_file is None:
            # Fail with a clear message, as the commodity path does, instead
            # of passing None on to pandas.
            raise FileNotFoundError("缺少国别文件")
        process_country_trade(country_file, year, month)

        print(f"{current_dir}数据已全部成功处理")
    except Exception as e:
        print(f"处理失败:{current_dir},错误:{str(e)}")
        raise
+
+
def process_combined_trade(current_dir, year, month):
    """Load the month's import/export commodity workbooks and upsert them.

    Rows whose commodity name has no code from ``DBHelper.get_commodity_id``
    are dropped. When ``month`` is 2, a second batch is written for January
    with the three value columns halved.

    NOTE(review): the February rows themselves are written un-halved, while
    the Guangdong crawler halves both months - confirm which is intended.

    Args:
        current_dir: ``Path`` of the directory holding the two workbooks.
        year: Year of the data.
        month: Month of the data.

    Raises:
        FileNotFoundError: If either workbook is missing.
    """
    import_file = next(current_dir.glob("*进口主要商品量值表*"), None)
    export_file = next(current_dir.glob("*出口主要商品量值表*"), None)
    if import_file is None or export_file is None:
        raise FileNotFoundError("缺少进口或出口文件")

    # Read both workbooks for the current month.
    trade = read_trade_pair(import_file, export_file)

    db = DBHelper()

    trade['commodity_code'] = trade['commodity_name'].apply(db.get_commodity_id)
    month_rows = trade[trade['commodity_code'].notnull()].copy()

    # Fill in the fixed columns for the current month.
    month_rows['crossborder_year'] = year
    month_rows['crossborder_year_month'] = f"{year}-{month:02d}"
    month_rows['prov_code'] = PROV_CODE
    month_rows['prov_name'] = PROV_NAME
    month_rows['monthly_total'] = month_rows['monthly_import'] + month_rows['monthly_export']

    table_name = 't_yujin_crossborder_prov_commodity_trade'
    key_cols = ['crossborder_year_month', 'prov_code', 'commodity_code']
    value_cols = ['monthly_total', 'monthly_import', 'monthly_export']
    output_cols = [
        'crossborder_year', 'crossborder_year_month', 'prov_code', 'prov_name',
        'commodity_code', 'commodity_name', 'monthly_total', 'monthly_import', 'monthly_export'
    ]

    # Write the current month.
    db.bulk_insert(
        month_rows[output_cols],
        table_name,
        conflict_columns=list(key_cols),
        update_columns=list(value_cols)
    )

    if month != 2:
        return

    # February only: also write a simulated January from the same rows.
    january_rows = month_rows.copy()
    january_rows['crossborder_year_month'] = f"{year}-01"
    january_rows[value_cols] = january_rows[value_cols] / 2

    db.bulk_insert(
        january_rows[output_cols],
        table_name,
        conflict_columns=list(key_cols),
        update_columns=value_cols  # only the value columns are updated
    )
+
+
def process_country_trade(current_file_path, year, month):
    """Load the country trade workbook, clean it and upsert it.

    Rows named in ``EXCLUDE_REGIONS``, rows whose name contains "(地区)", and
    rows without a code in ``COUNTRY_CODE_MAPPING`` are dropped. When ``month``
    is 2, a second batch is written for January with the value columns halved
    and the year-on-year columns set to 0.

    NOTE(review): the February rows themselves are written un-halved, while
    the Guangdong crawler halves both months - confirm which is intended.

    Args:
        current_file_path: Path of the country trade workbook.
        year: Year of the data.
        month: Month of the data.
    """
    final_df = read_with_header4(current_file_path, month)

    # Drop excluded regions and "(地区)" grouping rows. na=False stops a
    # missing name from producing NaN in the mask, which ``~`` cannot negate.
    # .copy() makes the column assignments below act on an independent frame.
    names = final_df['country_name']
    is_excluded = names.isin(EXCLUDE_REGIONS)
    is_region_group = names.str.contains(r'[((]地区[))]', regex=True, na=False)
    final_df = final_df[~is_excluded & ~is_region_group].copy()

    final_df['country_code'] = final_df['country_name'].map(COUNTRY_CODE_MAPPING)

    find_unmatched_countries(final_df)

    # Keep only rows that were mapped to a country code.
    final_df = final_df[final_df['country_code'].notnull()].copy()
    final_df['crossborder_year'] = year
    final_df['crossborder_year_month'] = f"{year}-{month:02d}"
    final_df['prov_code'] = PROV_CODE
    final_df['prov_name'] = PROV_NAME

    db = DBHelper()

    db.bulk_insert(
        final_df,
        't_yujin_crossborder_prov_country_trade',
        conflict_columns=['crossborder_year_month', 'prov_code', 'country_code'],
        update_columns=['monthly_total', 'monthly_import', 'monthly_export',
                        'yoy_import_export', 'yoy_import', 'yoy_export']
    )

    # February only: also write a simulated January from the same rows.
    if month == 2:
        january_df = final_df.copy()
        january_df['crossborder_year_month'] = f"{year}-01"

        numeric_cols = ['monthly_total', 'monthly_import', 'monthly_export']
        january_df[numeric_cols] = january_df[numeric_cols] / 2

        yoy_cols = ['yoy_import_export', 'yoy_import', 'yoy_export']
        january_df[yoy_cols] = 0.0  # simulated rows carry no year-on-year value

        db.bulk_insert(
            january_df,
            't_yujin_crossborder_prov_country_trade',
            conflict_columns=['crossborder_year_month', 'prov_code', 'country_code'],
            update_columns=numeric_cols + yoy_cols
        )
+
+
def read_with_header4(file_path, month):
    """Read the country trade workbook into a frame with seven fixed columns.

    For ``month == 2`` the first seven columns are used. For other months the
    sheet is probed first: if its first two columns (A-B, rows after the
    fifth) are entirely empty, the column selection is shifted right by two.

    Args:
        file_path: Path of the workbook.
        month: Month of the data; selects the column layout.

    Returns:
        DataFrame with the columns ``country_name``, ``monthly_total``,
        ``yoy_import_export``, ``monthly_export``, ``yoy_export``,
        ``monthly_import`` and ``yoy_import``. Names are cleaned; the
        year-on-year columns are numeric with blanks as 0, rounded to 2 places.
    """
    if month == 2:
        # The February workbook has a different layout from the other months.
        selected_cols = list(range(7))
    else:
        # Probe columns A:K without a header to detect leading empty columns.
        probe = pd.read_excel(
            file_path,
            usecols="A:K",
            header=None,
            skiprows=5,
            skipfooter=1
        )
        leading_empty = probe.iloc[:, 0:2].isnull().all().all()
        shift = 2 if leading_empty else 0

        selected_cols = [position + shift for position in (0, 1, 2, 5, 6, 9, 10)]

    # Real read: header on the fifth row, last row dropped.
    result = pd.read_excel(
        file_path,
        usecols=selected_cols,
        header=4,
        skipfooter=1
    )

    # Force canonical column names regardless of the sheet's own header text.
    result.columns = [
        'country_name', 'monthly_total', 'yoy_import_export',
        'monthly_export', 'yoy_export', 'monthly_import', 'yoy_import'
    ]

    result['country_name'] = result['country_name'].apply(clean_county_name)

    # Turn placeholder text into missing values, coerce to numbers, then fill
    # the gaps with 0 and round.
    # NOTE(review): downcast='float' may narrow to float32 - confirm whether
    # that precision is intended.
    yoy_names = ['yoy_import_export', 'yoy_export', 'yoy_import']
    placeholders = {
        '--': None,
        '': None,
        'N/A': None,
        '不详': None
    }
    yoy_values = result[yoy_names].replace(placeholders)
    yoy_values = yoy_values.apply(pd.to_numeric, errors='coerce', downcast='float')
    result[yoy_names] = yoy_values.fillna(0).round(2)

    return result
+
+
+
+# 进出口数据合并为一张表
def read_trade_pair(import_path, export_path):
    """Read the import and export workbooks and merge them by commodity name.

    From each workbook the columns at index 0 (name) and index 4 (value) are
    read, skipping the first three rows and the last row. Names are cleaned
    with ``clean_commodity_name``, and both value columns are passed through
    ``convert_wan_to_yuan`` after the merge.

    Args:
        import_path: Path of the import workbook.
        export_path: Path of the export workbook.

    Returns:
        DataFrame with ``commodity_name``, ``monthly_import`` and
        ``monthly_export``; a commodity missing on one side gets 0 there.
    """
    def load(path, value_column):
        # Shared reader: same layout for both workbooks, different value name.
        frame = pd.read_excel(
            path,
            skiprows=3,
            skipfooter=1,
            usecols=[0, 4],
            names=["commodity_name", value_column],
            header=None
        )
        return frame.assign(
            commodity_name=frame["commodity_name"].apply(clean_commodity_name)
        )

    imports = load(import_path, "monthly_import")
    exports = load(export_path, "monthly_export")

    combined = pd.merge(imports, exports, on="commodity_name", how="outer").fillna(0)
    for value_column in ("monthly_import", "monthly_export"):
        combined[value_column] = combined[value_column].apply(convert_wan_to_yuan)
    return combined
+
+
def calculate_monthly_values(current_data, prev_data):
    """Subtract the previous figures from the current ones, per commodity.

    The frames are left-joined on ``commodity_name``; a commodity absent from
    ``prev_data`` is treated as having previous values of 0.

    Args:
        current_data: DataFrame with ``commodity_name``, ``monthly_import``
            and ``monthly_export``.
        prev_data: DataFrame with the same columns.

    Returns:
        DataFrame with ``commodity_name``, ``monthly_import`` and
        ``monthly_export`` holding current minus previous.
    """
    joined = current_data.merge(
        prev_data,
        on="commodity_name",
        how="left",
        suffixes=("_current", "_prev"),
    ).fillna(0)
    for column in ("monthly_import", "monthly_export"):
        joined[column] = joined[f"{column}_current"] - joined[f"{column}_prev"]
    return joined[["commodity_name", "monthly_import", "monthly_export"]]
+
+
+# def clean_commodity_name(name):
+#     return re.sub(r'[^\w\u4e00-\u9fa5]', '', str(name)).strip()
+
+
+
+
+if __name__ == "__main__":
+    traverse_and_process(download_dir, parse_excel, province_name="henan")

+ 233 - 0
henan/selenium_henan_download.py

@@ -0,0 +1,233 @@
+import argparse
+import argparse
+import random
+import re
+import time
+import traceback
+from datetime import datetime, timedelta
+
+from selenium import webdriver
+from selenium.common import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from henan.henan_parse_excel import parse_excel
+from utils.constants import DOWNLOAD_DIR
+from utils.download_utils import configure_stealth_options, get_previous_month, download_excel, generate_month_sequence
+from utils.parse_utils import traverse_and_process
+
+# Basic configuration
+
+# Retry limit — presumably attempts per download; TODO confirm intended use.
+MAX_RETRY = 3
+# NOTE(review): presumably seconds; confirm where this timeout is consumed.
+DOWNLOAD_TIMEOUT = 60
+# Listing page opened by detect_latest_month().
+BASE_URL = "http://zhengzhou.customs.gov.cn/zhengzhou_customs/zfxxgk97/2967383/2967458/501407/0e9d768a-1.html"
+# Directory handed to the browser options, download_excel() and the parser.
+download_dir = DOWNLOAD_DIR / "henan"
+
+
+
+
+
+def detect_latest_month(driver):
+    """三级回溯智能检测最新有效月份(使用正则简化匹配)"""
+    driver.get(BASE_URL)
+    current_date = datetime.now()
+
+    for offset in range(0, 3):
+        check_date = current_date - timedelta(days=offset * 30)
+        check_year = check_date.year
+        check_month = check_date.month
+
+        # 构建正则表达式:兼容“1至X月”和“X月”两种格式,并允许年/月前后有空格
+        pattern = re.compile(
+            rf'{check_year}\s*年\s*(1至)?{check_month}\s*月\s*河南省进出口商品国别\(地区\)总值表',
+            re.IGNORECASE
+        )
+
+        try:
+            # 使用 Python 端的正则匹配所有链接 title
+            elements = WebDriverWait(driver, 10).until(
+                EC.presence_of_all_elements_located((By.XPATH, '//a'))
+            )
+            for element in elements:
+                title = element.get_attribute("title")
+                if pattern.search(title):
+                    print(f"已找到最新月份数据 {check_year}-{check_month}")
+                    return check_year, check_month
+
+            print(f"未找到匹配项(正则:{pattern.pattern})")
+        except TimeoutException:
+            print(f"页面加载超时或无匹配项({check_year}-{check_month})")
+            continue
+
+    raise Exception("三个月内未找到有效数据")
+
+
+
+def process_month_data(driver, year, month):
+    """兼容多种格式,确保三种表格都能识别并下载"""
+    # 定义三类目标标题模板
+    title_templates = [
+        f"{year}年1至{month}月河南省出口主要商品量值表",
+        f"{year}年1至{month}月河南省进口主要商品量值表",
+        f"{year}年1至{month}月河南省进出口商品国别(地区)总值表"
+    ]
+
+    # 构建正则匹配模板(支持“年X月”、“年1至X月”,并允许前后有空格)
+    patterns = [
+        re.compile(
+            rf'{year}\s*年\s*(1至)?{month}\s*月\s*河南省(?:出口主要商品|进口主要商品|进出口商品国别[$(|$(]地区[$)|$)])(量值表|总值表)',
+            re.IGNORECASE
+        )
+        for _ in [month]
+    ]
+
+    found_count = 0
+    links = driver.find_elements(By.XPATH, '//a[contains(@title,"河南省")]')
+
+    for link in links:
+        title = link.get_attribute("title")
+
+        if any(pattern.search(title) for pattern in patterns):
+            retry = 0
+            max_retries = 3  # 最大重试次数
+            success = False
+
+            while retry < max_retries and not success:
+                try:
+                    url = link.get_attribute("href")
+                    download_excel(driver, url, year, month, title, download_dir)
+                    found_count += 1
+                    time.sleep(random.uniform(0.5, 1.5))  # 下载间隔
+                    success = True  # 成功则跳出循环
+                except Exception as e:
+                    retry += 1
+                    print(f"下载 {title} 失败(第{retry}次重试): {e}")
+                    traceback.print_exc()
+                    if retry < max_retries:
+                        time.sleep(random.uniform(2, 5))  # 随机等待后再试
+                    else:
+                        print(f"{title} 下载已达到最大重试次数,跳过该文件。")
+
+    print(f"本页找到{found_count}个有效表格")
+    return found_count
+
+
+def reverse_crawler(driver, target_months):
+    """Walk the paginated listing and download the tables of each target month.
+
+    For every ``(year, month)`` in ``target_months`` the function waits for
+    the ``conList_ul`` element, then repeatedly calls ``process_month_data``
+    on the current page and clicks the "下一页" (next page) link until at
+    least three tables have been found for that month.  The browser is not
+    navigated back between months, so each month starts on whatever page the
+    previous month ended on.
+
+    Args:
+        driver: Selenium WebDriver already showing a listing page.
+        target_months: iterable of ``(year, month)`` pairs to collect.
+
+    Returns:
+        Set of ``(year, month)`` pairs for which at least three tables were
+        found.  Months that ran out of pages or hit an error are omitted.
+    """
+    processed_months = set()
+    # Example input: target_months = [(2023, 5), (2023, 4)]
+    page = 1  # page counter used in log messages; never reset between months
+    for year, month in target_months:
+        print(f"\n开始处理 {year}年{month}月数据".center(50, "="))
+
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+        )
+
+        current_page = 1  # incremented below but never read
+        found_tables = 0
+
+        while True:
+            # Random pause so the page can settle before it is queried.
+            random_sleep(base=2, variance=3)
+
+            try:
+                print(f"当前页面:{driver.current_url}, 第{page}页")
+                # Download the matching tables linked on the current page.
+                found = process_month_data(driver, year, month)
+                found_tables += found
+
+                # The month is complete once three tables have been collected.
+                if found_tables >= 3:
+                    print(f"已完成{year}年{month}月全部表格采集")
+                    processed_months.add((year, month))
+                    break
+
+                print(f"第{page}页已采集表格数:{found_tables}/3,前往下一页采集")
+                # Go to the next page; waits until the link is clickable.
+                WebDriverWait(driver, 15).until(
+                    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
+                ).click()
+
+                current_page += 1
+                page += 1
+
+
+            except TimeoutException:
+                # No clickable "next page" link within 15 s: give up on this month.
+                print(f"未找到更多分页,已采集表格数:{found_tables}/3")
+                break
+            except Exception as e:
+                print(f"分页异常:{str(e)}")
+                handle_retry(driver)  # refresh the page, then abandon this month
+                break
+
+    return processed_months
+
+
+def extract_page_date(driver):
+    """增强型页面日期提取"""
+    try:
+        date_str = WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+        ).get_attribute("innerHTML")
+        match = re.search(r"(\d{4})年(\d{1,2})月", date_str)
+        return int(match.group(1)), int(match.group(2))
+    except:
+        return datetime.now().year, datetime.now().month
+
+def random_sleep(base=2, variance=5):
+    """智能随机等待"""
+    sleep_time = base + random.random() * variance
+    time.sleep(sleep_time)
+
+def handle_retry(driver):
+    """异常恢复处理"""
+    try:
+        driver.refresh()
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+        )
+        print("浏览器异常已恢复")
+    except:
+        print("需要人工干预的严重错误")
+        raise
+
+
+def main():
+    """主入口(优化参数处理逻辑)"""
+    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+    parser.add_argument('--year', type=int, default=None,
+                        help='终止年份(如2023),未指定时抓取最新两个月')
+    args = parser.parse_args()
+    driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
+    try:
+        # 智能检测最新有效月份
+        valid_year, valid_month = detect_latest_month(driver)
+        print(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
+
+        # 生成目标序列
+        if args.year:
+            # 指定年份时:从最新月到目标年1月
+            target_months = generate_month_sequence(
+                start_year=valid_year,
+                start_month=valid_month,
+                end_year=args.year,
+                skip_january=True
+            )
+        else:
+            # 未指定年份时:取最近两个月
+            target_months = generate_month_sequence(valid_year, valid_month)
+
+        print(f"目标采集月份序列:{target_months}")
+        reverse_crawler(driver, target_months)
+        print(f"{len(target_months)}个月份数据已采集完毕")
+
+    finally:
+        driver.quit()
+        print("\n数据清洗入库中...")
+        traverse_and_process(download_dir, parse_excel, province_name="henan")
+
+
+# Run the crawler only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

+ 469 - 0
quanguo/CountryTrade.py

@@ -0,0 +1,469 @@
+import re
+
+import pandas as pd
+
+from utils.parse_utils import clean_county_name
+
+# Configuration
+# Workbook read by parse_excel_to_sql() (hard-coded local path).
+EXCEL_PATH = r"D:/Downloads/2025051809262394128.xls"
+# File the generated INSERT statements are written to.
+OUTPUT_SQL = "../downloads/country_trade04.sql"
+# Row labels that parse_excel_to_sql() skips instead of treating as countries.
+EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
+                   "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",
+                   "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区"]
+
+COUNTRY_CODE_MAPPING = {
+    # ================= 亚洲 =================
+    "阿富汗": "AF",
+    "巴林": "BH",
+    "孟加拉国": "BD",
+    "不丹": "BT",
+    "文莱": "BN",
+    "缅甸": "MM",
+    "柬埔寨": "KH",
+    "塞浦路斯": "CY",
+    "朝鲜": "KP",
+    "中国香港": "HK",
+    "印度": "IN",
+    "印度尼西亚": "ID",
+    "伊朗": "IR",
+    "伊拉克": "IQ",
+    "以色列": "IL",
+    "日本": "JP",
+    "约旦": "JO",
+    "科威特": "KW",
+    "老挝": "LA",
+    "黎巴嫩": "LB",
+    "中国澳门": "MO",
+    "马来西亚": "MY",
+    "马尔代夫": "MV",
+    "蒙古": "MN",
+    "尼泊尔": "NP",
+    "阿曼": "OM",
+    "巴基斯坦": "PK",
+    "巴勒斯坦": "PS",
+    "菲律宾": "PH",
+    "卡塔尔": "QA",
+    "沙特阿拉伯": "SA",
+    "新加坡": "SG",
+    "韩国": "KR",
+    "斯里兰卡": "LK",
+    "叙利亚": "SY",
+    "泰国": "TH",
+    "土耳其": "TR",
+    "阿联酋": "AE",
+    "也门": "YE",
+    "越南": "VN",
+    "中国": "CN",
+    "中国台湾": "TW",
+    "哈萨克斯坦": "KZ",
+    "吉尔吉斯斯坦": "KG",
+    "塔吉克斯坦": "TJ",
+    "土库曼斯坦": "TM",
+    "乌兹别克斯坦": "UZ",
+    "格鲁吉亚": "GE",
+    "亚美尼亚": "AM",
+    "阿塞拜疆": "AZ",
+
+    # ================= 非洲 =================
+    "阿尔及利亚": "DZ",
+    "安哥拉": "AO",
+    "贝宁": "BJ",
+    "博茨瓦纳": "BW",
+    "布隆迪": "BI",
+    "喀麦隆": "CM",
+    "佛得角": "CV",
+    "中非": "CF",
+    "乍得": "TD",
+    "科摩罗": "KM",
+    "刚果共和国": "CG",
+    "吉布提": "DJ",
+    "埃及": "EG",
+    "赤道几内亚": "GQ",
+    "埃塞俄比亚": "ET",
+    "加蓬": "GA",
+    "冈比亚": "GM",
+    "加纳": "GH",
+    "几内亚": "GN",
+    "几内亚比绍": "GW",
+    "科特迪瓦": "CI",
+    "肯尼亚": "KE",
+    "莱索托": "LS",
+    "利比里亚": "LR",
+    "利比亚": "LY",
+    "马达加斯加": "MG",
+    "马拉维": "MW",
+    "马里": "ML",
+    "毛里塔尼亚": "MR",
+    "毛里求斯": "MU",
+    "摩洛哥": "MA",
+    "莫桑比克": "MZ",
+    "纳米比亚": "NA",
+    "尼日尔": "NE",
+    "尼日利亚": "NG",
+    "卢旺达": "RW",
+    "圣多美和普林西比": "ST",
+    "塞内加尔": "SN",
+    "塞舌尔": "SC",
+    "塞拉利昂": "SL",
+    "索马里": "SO",
+    "南非": "ZA",
+    "苏丹": "SD",
+    "坦桑尼亚": "TZ",
+    "多哥": "TG",
+    "突尼斯": "TN",
+    "乌干达": "UG",
+    "布基纳法索": "BF",
+    "刚果民主共和国": "CD",
+    "赞比亚": "ZM",
+    "津巴布韦": "ZW",
+    "厄立特里亚": "ER",
+    "南苏丹": "SS",
+
+    # ================= 欧洲 =================
+    "比利时": "BE",
+    "丹麦": "DK",
+    "英国": "GB",
+    "德国": "DE",
+    "法国": "FR",
+    "爱尔兰": "IE",
+    "意大利": "IT",
+    "卢森堡": "LU",
+    "荷兰": "NL",
+    "希腊": "GR",
+    "葡萄牙": "PT",
+    "西班牙": "ES",
+    "阿尔巴尼亚": "AL",
+    "奥地利": "AT",
+    "保加利亚": "BG",
+    "芬兰": "FI",
+    "匈牙利": "HU",
+    "冰岛": "IS",
+    "列支敦士登": "LI",
+    "马耳他": "MT",
+    "挪威": "NO",
+    "波兰": "PL",
+    "罗马尼亚": "RO",
+    "瑞典": "SE",
+    "瑞士": "CH",
+    "爱沙尼亚": "EE",
+    "拉脱维亚": "LV",
+    "立陶宛": "LT",
+    "白俄罗斯": "BY",
+    "摩尔多瓦": "MD",
+    "俄罗斯": "RU",
+    "乌克兰": "UA",
+    "斯洛文尼亚": "SI",
+    "克罗地亚": "HR",
+    "捷克": "CZ",
+    "斯洛伐克": "SK",
+    "北马其顿": "MK",
+    "波斯尼亚和黑塞哥维那": "BA",
+    "梵蒂冈": "VA",
+    "塞尔维亚": "RS",
+    "黑山": "ME",
+
+    # ================= 美洲 =================
+    "安提瓜和巴布达": "AG",
+    "阿根廷": "AR",
+    "巴哈马": "BS",
+    "巴巴多斯": "BB",
+    "伯利兹": "BZ",
+    "玻利维亚": "BO",
+    "巴西": "BR",
+    "加拿大": "CA",
+    "智利": "CL",
+    "哥伦比亚": "CO",
+    "哥斯达黎加": "CR",
+    "古巴": "CU",
+    "多米尼克": "DM",
+    "多米尼加": "DO",
+    "厄瓜多尔": "EC",
+    "萨尔瓦多": "SV",
+    "格林纳达": "GD",
+    "危地马拉": "GT",
+    "圭亚那": "GY",
+    "海地": "HT",
+    "洪都拉斯": "HN",
+    "牙买加": "JM",
+    "墨西哥": "MX",
+    "尼加拉瓜": "NI",
+    "巴拿马": "PA",
+    "巴拉圭": "PY",
+    "秘鲁": "PE",
+    "圣卢西亚": "LC",
+    "圣文森特和格林纳丁斯": "VC",
+    "苏里南": "SR",
+    "特立尼达和多巴哥": "TT",
+    "美国": "US",
+    "乌拉圭": "UY",
+    "委内瑞拉": "VE",
+    "圣基茨和尼维斯": "KN",
+
+    # ================= 大洋洲 =================
+    "澳大利亚": "AU",
+    "斐济": "FJ",
+    "基里巴斯": "KI",
+    "马绍尔群岛": "MH",
+    "密克罗尼西亚联邦": "FM",
+    "瑙鲁": "NR",
+    "新西兰": "NZ",
+    "帕劳": "PW",
+    "巴布亚新几内亚": "PG",
+    "萨摩亚": "WS",
+    "所罗门群岛": "SB",
+    "汤加": "TO",
+    "图瓦卢": "TV",
+    "瓦努阿图": "VU",
+
+    # ================= 特殊地区 =================
+    "法属圭亚那": "GF",
+    "瓜德罗普": "GP",
+    "留尼汪": "RE",
+    "圣马丁": "MF",
+    "荷属圣马丁": "SX",
+    "法属波利尼西亚": "PF",
+    "新喀里多尼亚": "NC",
+    "库克群岛": "CK",
+    "关岛": "GU",
+    "波多黎各": "PR",
+    "美属萨摩亚": "AS",
+    "百慕大": "BM",
+    "开曼群岛": "KY",
+    "福克兰群岛(马尔维纳斯)": "FK",
+    "格陵兰": "GL",
+    "法属南方领地": "TF",
+    "赫德岛和麦克唐纳岛": "HM",
+    "托克劳": "TK",
+    "纽埃": "NU",
+    "诺福克岛": "NF",
+    "北马里亚纳群岛": "MP",
+    "皮特凯恩": "PN",
+    "圣赫勒拿": "SH",
+    "斯瓦尔巴群岛和扬马延岛": "SJ",
+    "东帝汶": "TL",
+    # ==== 欧洲特殊地区 ====
+    "加那利群岛": "IC",  # 西班牙特殊领土代码
+    "塞卜泰(休达)": "XC",  # 休达官方代码
+    "梅利利亚": "XL",  # 梅利利亚官方代码
+    "安道尔": "AD",
+    "直布罗陀": "GI",
+    "摩纳哥": "MC",
+    "圣马力诺": "SM",
+    "法罗群岛": "FO",  # 丹麦自治领
+    "奥兰群岛": "AX",  # 芬兰自治省
+    "格恩西": "GG",  # 英国皇家属地
+    "马恩岛": "IM",
+    "泽西": "JE",
+
+    # ==== 非洲特殊地区 ====
+    "西撒哈拉": "EH",  # 争议地区代码
+    "斯威士兰": "SZ",  # 正式国名为"Eswatini"但保留旧映射
+    "马约特": "YT",  # 法国海外省
+
+    # ==== 美洲特殊地区 ====
+    "英属印度洋领地": "IO",
+    "阿鲁巴": "AW",
+    "库拉索": "CW",
+    "马提尼克": "MQ",  # 法国海外省
+    "蒙特塞拉特": "MS",
+    "法属圣马丁": "MF",
+    "特克斯和凯科斯群岛": "TC",
+    "英属维尔京群岛": "VG",
+    "博纳尔,圣俄斯塔休斯和萨巴": "BQ",
+    "圣巴泰勒米": "BL",  # 法国海外集体
+    "美属维尔京群岛": "VI",
+    "安圭拉": "AI",
+    "圣皮埃尔和密克隆": "PM",
+
+    # ==== 大洋洲特殊地区 ====
+    "瓦利斯和富图纳": "WF",
+    "科科斯(基林)群岛": "CC",
+    "圣诞岛": "CX",
+    "美国本土外小岛屿": "UM",
+
+    # ==== 特殊标记 ====
+    "布维岛": "BV",  # 挪威属地
+    "南乔治亚岛和南桑德韦奇岛": "GS",
+    "国家(地区)不明": "XX"  # 自定义代码
+}
+
+
+# 在代码中添加例外处理
+def get_country_code(chinese_name):
+    """带异常处理的国家编码查询"""
+    code = COUNTRY_CODE_MAPPING.get(chinese_name.strip(), None)
+
+    # 特殊处理逻辑
+    if not code:
+        if chinese_name.endswith("(地区)"):
+            return "N/A"  # 标记为地区
+        elif "国家联盟" in chinese_name:
+            return "ORG"  # 标记为国际组织
+    return code
+
+
+def parse_excel_to_sql():
+    """主处理函数"""
+    # 读取Excel文件(注意需要xlrd 1.2.0版本)
+    try:
+        df = pd.read_excel(EXCEL_PATH, engine='xlrd', header=None, dtype=str)
+    except ImportError:
+        raise ImportError("需要xlrd库支持.xls文件,请安装:pip install xlrd==1.2.0")
+    except Exception as e:
+        raise ValueError(f"读取Excel文件失败:{str(e)}")
+
+    # 解析年份月份
+    year_month = None
+    data_start = None
+    for idx, row in df.iterrows():
+        cell_value = str(row[1])
+        if "进出口商品国别" in cell_value:
+            match = re.search(r"(\d{4})年(\d{1,2})月", cell_value)
+            if match:
+                year = match.group(1)
+                month = match.group(2).zfill(2)
+                year_month = f"{year}-{month}"
+                # 调整起始行计算公式
+                data_start = idx + 6  # 原为+3,改为+4
+                break
+
+    if not year_month or data_start is None:
+        raise ValueError("无法解析年份月份信息,请检查Excel文件格式")
+
+    # 读取数据区域
+    try:
+        df_data = pd.read_excel(EXCEL_PATH, engine='xlrd', skiprows=data_start, header=None)
+    except Exception as e:
+        raise ValueError(f"读取数据区域失败:{str(e)}")
+
+    # 列映射配置(根据实际Excel结构调整)
+    COL_MAPPING = {
+        "country_name": 1,
+        "monthly_total": 2,
+        "monthly_export": 4,
+        "monthly_import": 6,
+        "ytd_total": 3,
+        "ytd_export": 5,
+        "ytd_import": 7,
+        "yoy_total": 8,
+        "yoy_export": 9,
+        "yoy_import": 10
+    }
+
+    # 生成SQL语句
+    sql_statements = []
+    for index, row in df_data.iterrows():
+        try:
+            # 获取国家名称
+            country_name = str(row[COL_MAPPING["country_name"]]).strip()
+
+            country_name = clean_county_name(country_name)
+
+            # 跳过空行和区域数据
+            if not country_name or country_name in EXCLUDE_REGIONS:
+                continue
+
+            # 获取国家代码
+            country_code = get_country_code(country_name)
+            # 新增:排除地区和组织代码
+            if country_code in ['N/A', 'ORG', 'XX'] or not country_code:
+                print(f"跳过地区/组织:{country_name}")
+                continue
+            if not country_code:
+                print(f"警告:第{index + data_start + 1}行 未找到国家代码 [{country_name}]")
+                continue
+
+            # 数值处理函数(万元转元)
+            # 修改后的数值处理函数
+            def convert_value(val):
+                """增强版数值转换,处理所有异常情况"""
+                try:
+                    # 处理特殊字符
+                    cleaned = re.sub(r'[^0-9\.\-\+]', '', str(val))
+                    # 处理空值和纯符号
+                    if cleaned in ('', '-', '+'):
+                        return 0.0
+                    # 处理科学计数法
+                    if 'e' in cleaned.lower():
+                        return float(cleaned)
+
+                    # 验证数值有效性
+                    parts = cleaned.split('.')
+                    if len(parts) > 2:  # 多个小数点
+                        return 0.0
+                    if '-' in cleaned and not cleaned.startswith('-'):  # 负号位置错误
+                        return 0.0
+
+                    return float(cleaned)
+                except:
+                    return 0.0
+
+            def safe_convert(val):
+                """安全转换函数,处理特殊字符和空值"""
+                # 处理空值和特殊占位符
+                if pd.isna(val) or str(val).strip() in ('', '-', 'NA'):
+                    return 'null'  # 返回Python的None,对应SQL的NULL
+
+                # 清理非数字字符
+                cleaned = re.sub(r'[^0-9\.\-]', '', str(val))
+
+                try:
+                    return round(float(cleaned), 2)
+                except:
+                    return 'null'
+
+            # 数据转换
+            values = {
+                "year_month": year_month,
+                "country_code": country_code,
+                "country_name": country_name.replace("'", "''"),  # 处理单引号
+                "monthly_total": convert_value(row[COL_MAPPING["monthly_total"]]),
+                "monthly_export": convert_value(row[COL_MAPPING["monthly_export"]]),
+                "monthly_import": convert_value(row[COL_MAPPING["monthly_import"]]),
+                "ytd_total": convert_value(row[COL_MAPPING["ytd_total"]]),
+                "ytd_export": convert_value(row[COL_MAPPING["ytd_export"]]),
+                "ytd_import": convert_value(row[COL_MAPPING["ytd_import"]]),
+                "yoy_total": safe_convert(row[COL_MAPPING["yoy_total"]]),
+                "yoy_export": safe_convert(row[COL_MAPPING["yoy_export"]]),
+                "yoy_import": safe_convert(row[COL_MAPPING["yoy_import"]])
+            }
+
+            # 构建SQL语句
+            sql = f"""INSERT INTO t_yujin_crossborder_country_trade (
+                `year_month`, country_code, country_name,
+                monthly_total, monthly_import, monthly_export,
+                ytd_total, ytd_import, ytd_export,
+                yoy_import_export, yoy_import, yoy_export
+            ) VALUES (
+                '{values["year_month"]}',
+                '{values["country_code"]}',
+                '{values["country_name"]}',
+                {values["monthly_total"]},
+                {values["monthly_import"]},
+                {values["monthly_export"]},
+                {values["ytd_total"]},
+                {values["ytd_import"]},
+                {values["ytd_export"]},
+                {values["yoy_total"]},
+                {values["yoy_export"]},
+                {values["yoy_import"]}
+            );"""
+            sql_statements.append(sql)
+        except Exception as e:
+            print(f"第{index + data_start + 1}行处理失败:{str(e)}")
+            continue
+
+    # 写入SQL文件
+    try:
+        with open(OUTPUT_SQL, "w", encoding="utf-8") as f:
+            f.write("\n".join(sql_statements))
+        print(f"成功生成 {len(sql_statements)} 条SQL语句,保存至:{OUTPUT_SQL}")
+    except Exception as e:
+        print(f"文件写入失败:{str(e)}")
+
+
+# Script entry point: run the conversion and print any failure instead of
+# letting the traceback propagate.
+if __name__ == "__main__":
+    try:
+        parse_excel_to_sql()
+    except Exception as e:
+        print(f"程序执行失败:{str(e)}")

+ 178 - 0
quanguo/CountryTradeYear.py

@@ -0,0 +1,178 @@
+import os
+from decimal import Decimal
+
+import pandas as pd
+import pymysql
+from pymysql import Error
+
+from utils.constants import COUNTRY_CODE_MAPPING
+
+# Year whose twelve monthly files are loaded by main().
+YEAR = 2023
+
+# Keyword arguments passed to pymysql.connect() in main().
+# SECURITY NOTE(review): the host, user name and password are hard-coded in
+# source control.  Consider loading them from the environment or an untracked
+# config file, and rotating this password.
+DB_CONFIG = {
+    'host': '10.130.75.149',
+    'port': 3307,
+    'user': 'yto_crm',
+    'password': '%3sFUlsolaRI',
+    'database': 'crm_uat',
+    'charset': 'utf8mb4'
+}
+
+
+
+# 在代码中添加例外处理
+def get_country_code(chinese_name):
+    """带异常处理的国家编码查询"""
+    code = COUNTRY_CODE_MAPPING.get(chinese_name.strip(), None)
+
+    # 特殊处理逻辑
+    if not code:
+        if chinese_name.endswith("(地区)"):
+            return "N/A"  # 标记为地区
+        elif "国家联盟" in chinese_name:
+            return "ORG"  # 标记为国际组织
+    return code
+
+
+def parse_value(val):
+    """增强型数值解析(含科学计数法处理),保留四位小数"""
+    if val in ('-', None, 'None', 'null'):
+        return None
+    try:
+        # 科学计数法处理(如1.2E+5),使用Decimal处理以避免浮动精度问题
+        if 'E' in str(val).upper():
+            return Decimal(val).quantize(Decimal('0.0000'))  # 用Decimal处理科学计数法,确保四位小数
+        return Decimal(str(val).replace(',', '')).quantize(Decimal('0.0000'))  # 保留四位小数
+    except Exception as e:
+        print(f"数值解析错误:{val},错误:{e}")
+        return None
+
+def parse_ratio(value):
+    """处理百分比数据"""
+    return value if value not in ['-', ''] else None
+
+def batch_upsert(conn, file_path, year_month):
+    """批量插入/更新数据"""
+    try:
+        # 读取Excel文件
+        df = pd.read_excel(file_path, engine='xlrd', header=None, dtype=str)
+    except Exception as e:
+        raise ValueError(f"文件读取失败:{str(e)}")
+
+    # 定位数据起始行(根据报表结构调整)
+    data_start = None
+    for idx, row in df.iterrows():
+        if "进出口商品国别" in str(row[1]):
+            data_start = idx + 6  # 根据实际表格结构调整
+            break
+
+    if not data_start:
+        raise ValueError("无法定位数据起始行")
+
+    # 读取数据区域
+    df_data = pd.read_excel(file_path, engine='xlrd',
+                            skiprows=data_start,
+                            header=None,
+                            usecols=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])  # 根据实际列索引调整
+
+    # 准备批量插入数据
+    params = []
+    for _, row in df_data.iterrows():
+        country_name = str(row[1]).strip()
+
+        # 过滤地区和组织
+        if not country_name or country_name in EXCLUDE_REGIONS:
+            continue
+
+        # 获取国家代码
+        country_code = get_country_code(country_name)
+        if not country_code or country_code in ['N/A', 'ORG', 'XX']:
+            continue
+
+        # 数据转换(根据实际列索引调整)
+        values = (
+            year_month,
+            country_code,
+            country_name,
+            parse_value(row[2]),  # monthly_total
+            parse_value(row[4]),  # monthly_export
+            parse_value(row[6]),  # monthly_import
+            parse_value(row[3]),  # ytd_total
+            parse_value(row[5]),  # ytd_export
+            parse_value(row[7]),  # ytd_import
+            parse_ratio(row[8]),  # yoy_import_export
+            parse_ratio(row[9]),  # yoy_export
+            parse_ratio(row[10])  # yoy_import
+        )
+        params.append(values)
+
+    # 构建SQL模板(使用ON DUPLICATE KEY UPDATE)[9,10](@ref)
+    sql = """
+          INSERT INTO t_yujin_crossborder_country_trade (`year_month`, country_code, country_name, \
+                                                         monthly_total, monthly_export, monthly_import,  \
+                                                         ytd_total, ytd_export, ytd_import, \
+                                                         yoy_import_export, yoy_export, yoy_import) \
+          VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY \
+          UPDATE \
+              monthly_total = \
+          VALUES (monthly_total), monthly_import = \
+          VALUES (monthly_import), monthly_export = \
+          VALUES (monthly_export), ytd_total = \
+          VALUES (ytd_total), ytd_import = \
+          VALUES (ytd_import), ytd_export = \
+          VALUES (ytd_export), yoy_import_export = \
+          VALUES (yoy_import_export), yoy_import = \
+          VALUES (yoy_import), yoy_export = \
+          VALUES (yoy_export) \
+          """
+
+    # 执行批量操作[1,4](@ref)
+    try:
+        with conn.cursor() as cursor:
+            cursor.executemany(sql, params)
+            return cursor.rowcount
+    except Error as e:
+        conn.rollback()
+        raise RuntimeError(f"数据库操作失败:{str(e)}")
+
+
+def main():
+    """主执行流程"""
+    conn = None
+    try:
+        # 建立数据库连接
+        conn = pymysql.connect(**DB_CONFIG)
+
+        # 按月份顺序处理
+        for month in range(1, 13):
+            # 构建文件路径
+            folder = f"downloads/{YEAR}/{month:02d}月"
+            file_name = f"(2){YEAR}年进出口商品国别(地区)总值表.xls"
+            file_path = os.path.join(folder, file_name)
+
+            if not os.path.isfile(file_path):
+                print(f"⚠️ 文件不存在:{file_path}")
+                continue
+
+            # 生成年月标识
+            year_month = f"{YEAR}-{month:02d}"
+
+            print(f"⌛ 正在处理 {year_month} 数据...")
+            try:
+                count = batch_upsert(conn, file_path, year_month)
+                conn.commit()
+                print(f"✅ 成功更新 {year_month},影响 {count} 条记录")
+            except Exception as e:
+                print(f"❌ {year_month} 处理失败:{str(e)}")
+                conn.rollback()
+
+    except Error as e:
+        print(f"数据库连接失败:{str(e)}")
+    finally:
+        if conn and conn.open:
+            conn.close()
+        print("🏁 所有月份处理完成")
+
+
+# Run the yearly import only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

+ 147 - 0
quanguo/CrossDownload.py

@@ -0,0 +1,147 @@
+import random
+import re
+import time
+from pathlib import Path
+
+from faker import Faker
+from playwright.sync_api import sync_playwright
+
+# Exact first-cell texts of the table rows that crawl_with_fingerprint()
+# downloads; rows whose name is not in this list are skipped.
+# NOTE(review): the year 2025 is hard-coded in every entry.
+TARGET_TABLES = [
+    "(1)2025年进出口商品总值表 A:年度表",
+    "(1)2025年进出口商品总值表 B:月度表",
+    "(2)2025年进出口商品国别(地区)总值表",
+    "(4)2025年进出口商品类章总值表",
+    "(8)2025年进出口商品收发货人所在地总值表",
+    "(15)2025年对部分国家(地区)出口商品类章金额表",
+    "(16)2025年自部分国家(地区)进口商品类章金额表"
+]
+
+
+def generate_dynamic_fingerprint(page):
+    """Placeholder for browser-fingerprint setup; currently does nothing.
+
+    The body consists only of this docstring, so a call returns ``None`` and
+    leaves ``page`` untouched.
+    """
+    # NOTE(review): the original comment here read "...unchanged...", which
+    # suggests the real implementation was elided — confirm before relying on it.
+
+
+def process_table_row(row):
+    """动态元素处理方案(网页4建议)"""
+    # 使用locator替代静态查询
+    cells = row.locator('td').all()
+    if len(cells) < 2:
+        return None
+
+    try:
+        table_name = cells[0].inner_text(timeout=5000).strip()
+        month_links = [
+            (int(a.inner_text().replace('月', '')), a.get_attribute('href'))
+            for a in cells[1].locator('a.blue').all()
+            if a.is_visible()
+        ]
+        month_links.sort(reverse=True, key=lambda x: x[0])
+        return (table_name, month_links)
+    except Exception as e:
+        print(f"行处理异常: {str(e)}")
+        return None
+
+
+def download_excel(page, table_name, month_data):
+    """优化后的下载方法(整合网页6、7、8方案)"""
+    max_month, max_link = month_data
+    safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
+
+    try:
+        # 直接访问下载链接(网页6技术)
+        with page.expect_download() as download_info:
+            page.goto(f"http://www.customs.gov.cn{max_link}",
+                      wait_until="networkidle",
+                      timeout=60000)
+
+            # 精准定位下载元素(适配新页面结构)
+            download_btn = page.locator('span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]')
+            download_btn.click(timeout=15000)
+
+        download = download_info.value
+        file_ext = download.suggested_filename.split('.')[-1]
+        file_name = f"{safe_name}_{max_month}月.{file_ext}"
+
+        # 创建下载目录
+        download_path = Path('../src/downloads') / f"{time.strftime('%Y%m%d')}"
+        download_path.mkdir(parents=True, exist_ok=True)
+
+        # 保存文件(网页8方案)
+        final_path = download_path / file_name
+        download.save_as(final_path)
+        print(f"√ 成功下载: {file_name}")
+
+        # 返回原始页面(关键修复点)
+        page.go_back()
+        page.wait_for_load_state('networkidle')
+        return final_path
+
+    except Exception as e:
+        print(f"× 下载失败 {table_name}: {str(e)}")
+        page.screenshot(path=f'error_{safe_name}.png')
+        raise
+
+
+def crawl_with_fingerprint(url):
+    with sync_playwright() as p:
+        browser = p.firefox.launch(
+            headless=True,
+            args=[
+                '--disable-blink-features=AutomationControlled',
+                '--lang=zh-CN',
+                '--window-size=1440,900'
+            ]
+        )
+
+        context = browser.new_context(
+            user_agent=Faker().firefox(),
+            viewport={'width': 1440, 'height': 900},
+            device_scale_factor=1,
+            accept_downloads=True,  # 关键参数(网页7建议)
+            extra_http_headers={
+                "Host": "www.customs.gov.cn",
+                "Accept-Language": "zh-CN,zh;q=0.9"
+            }
+        )
+
+        try:
+            page = context.new_page()
+            page.add_init_script("""
+                Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+                window.alert = () => {};
+            """)
+
+            # 优化后的表格处理流程
+            page.goto(url, wait_until="networkidle", timeout=60000)
+            rows_locator = page.locator('#yb2025RMB tr')
+            for i in range(1, rows_locator.count()):
+                row = rows_locator.nth(i)
+                if not row.is_visible():
+                    continue
+
+                result = process_table_row(row)
+                if not result: continue
+
+                table_name, month_links = result
+                if table_name not in TARGET_TABLES: continue
+                if not month_links: continue
+
+                try:
+                    download_excel(page, table_name, month_links[0])
+                    time.sleep(random.uniform(2, 5))  # 随机等待
+                except Exception as e:
+                    print(f"表格处理中断: {str(e)}")
+                    break
+
+                # 释放元素引用(关键修复点)
+                row.evaluate('element => element.remove()')
+
+        finally:
+            context.close()
+            browser.close()
+
+
+if __name__ == "__main__":
+    Path('../src/downloads').mkdir(exist_ok=True)
+    crawl_with_fingerprint("http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html")

+ 148 - 0
quanguo/CrossDownloadYear.py

@@ -0,0 +1,148 @@
+from playwright.sync_api import sync_playwright
+import re
+import time
+import random
+from pathlib import Path
+from faker import Faker
+
+# Statistics year to download; it is interpolated into the table titles below,
+# the index-table selector and the download directory name.
+YEAR = 2023
+
+# Exact titles (first column of the index table) of the tables to download.
+# Rows whose title is not in this list are skipped by the crawler.
+TARGET_TABLES = [
+    f"(2){YEAR}年进出口商品国别(地区)总值表",
+    f"(4){YEAR}年进出口商品类章总值表",
+    f"(8){YEAR}年进出口商品收发货人所在地总值表",
+    f"(15){YEAR}年对部分国家(地区)出口商品类章金额表",
+    f"(16){YEAR}年自部分国家(地区)进口商品类章金额表"
+]
+
+def process_table_row(row):
+    """动态处理表格行数据(整合网页2表格解析方案)"""
+    try:
+        cells = row.locator('td').all()
+        if len(cells) < 2:
+            return None
+
+        table_name = cells[0].inner_text(timeout=8000).strip()
+        month_links = [
+            (int(a.inner_text().replace('月', '')), a.get_attribute('href'))
+            for a in cells[1].locator('a').all()
+            if a.is_visible() and a.get_attribute('href')
+        ]
+
+        # 按月份升序排列(1-12月)
+        month_links.sort(key=lambda x: x[0])
+        return (table_name, month_links)
+    except Exception as e:
+        print(f"表格行处理异常: {str(e)}")
+        return None
+
+
+def download_monthly_data(page, table_name, month_data):
+    """下载单月数据文件(整合网页4、网页6存储方案)"""
+    month_num, link = month_data
+    safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')
+
+    try:
+        with page.expect_download() as download_info:
+            page.goto(f"http://www.customs.gov.cn{link}",
+                      wait_until="networkidle",
+                      timeout=80000)
+
+            # 通用下载按钮定位策略(适配不同页面结构)
+            download_btn = page.locator('span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]')
+            download_btn.click(timeout=15000)
+
+        download = download_info.value
+        file_ext = download.suggested_filename.split('.')[-1] if '.' in download.suggested_filename else 'xls'
+
+        # 创建月份目录(网页6路径规范)
+        download_dir = Path('../src/downloads') / f"{YEAR}/{month_num:02d}月"
+        download_dir.mkdir(parents=True, exist_ok=True)
+
+        # 规范文件命名
+        final_path = download_dir / f"{safe_name}.{file_ext}"
+        download.save_as(final_path)
+        print(f"√ 成功下载:{final_path}")
+
+        # 返回原始页面并等待恢复(网页8状态管理)
+        page.go_back()
+        page.wait_for_load_state('networkidle')
+        return True
+
+    except Exception as e:
+        print(f"× 下载失败 {table_name} {month_num}月:{str(e)}")
+        page.screenshot(path=f'error_{safe_name}_{month_num:02d}.png')
+        return False
+
+
+def crawl_with_fingerprint(url):
+    with sync_playwright() as p:
+        browser = p.firefox.launch(
+            headless=True,
+            args=[
+                '--disable-blink-features=AutomationControlled',
+                '--lang=zh-CN',
+                '--window-size=1440,900'
+            ]
+        )
+
+        context = browser.new_context(
+            user_agent=Faker().firefox(),
+            viewport={'width': 1440, 'height': 900},
+            device_scale_factor=1,
+            accept_downloads=True,
+            extra_http_headers={
+                "Host": "www.customs.gov.cn",
+                "Accept-Language": "zh-CN,zh;q=0.9"
+            }
+        )
+
+        try:
+            page = context.new_page()
+            page.add_init_script("""
+                Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+                window.alert = () => {};
+            """)
+
+            # 访问目标页面
+            page.goto(url, wait_until="networkidle", timeout=60000)
+            rows = page.locator(f'#yb{YEAR}RMB tr').all()[1:]  # 跳过标题行
+            print(f"共找到 {len(rows)} 个表格")
+            for row in rows:
+                result = process_table_row(row)
+                if not result:
+                    continue
+
+                table_name, month_links = result
+                if table_name not in TARGET_TABLES:
+                    continue
+
+                print(f"\n开始处理表格:{table_name}")
+
+                # 遍历所有月份(整合网页2遍历方案)
+                for month_data in month_links:
+                    month_num = month_data[0]
+                    if 1 <= month_num <= 12:  # 过滤有效月份
+                        retry_count = 0
+                        while retry_count < 2:  # 失败重试机制
+                            if download_monthly_data(page, table_name, month_data):
+                                break
+                            retry_count += 1
+                            time.sleep(5)
+
+                        # 随机等待(网页7反爬建议)
+                        time.sleep(random.uniform(3, 8))
+
+                # 释放元素引用(网页8内存管理)
+                row.evaluate('element => element.remove()')
+
+        finally:
+            context.close()
+            browser.close()
+
+
+if __name__ == "__main__":
+    Path('../src/downloads').mkdir(exist_ok=True)
+    target_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html"
+    crawl_with_fingerprint(target_url)
+    print("全年数据下载任务已完成")

+ 38 - 0
quanguo/ScrpyDownload.py

@@ -0,0 +1,38 @@
+import scrapy
+from scrapy.cmdline import execute
+import os
+import sys
+
+
+class CustomSpider(scrapy.Spider):
+    name = 'customs_gov'
+    allowed_domains = ['www.customs.gov.cn']
+
+    # 显式定义入口请求(网页6、网页7)
+    def start_requests(self):
+        urls = [
+            'http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/4899681/index.html'
+        ]
+        for url in urls:
+            yield scrapy.Request(
+                url=url,
+                callback=self.parse,
+                headers={
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
+                    'Referer': 'https://www.customs.gov.cn/'
+                }
+            )
+
+    def parse(self, response):
+        # 调试响应状态(网页4)
+        if response.status == 200:
+            print("===== 响应内容片段 =====")
+            print(response.text[:1000])
+        else:
+            self.logger.error(f"请求失败,状态码: {response.status}")
+
+
+# 添加IDE直接运行入口(网页1、网页3)
+if __name__ == "__main__":
+    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+    execute(['scrapy', 'crawl', 'customs_gov'])

+ 0 - 0
quanguo/__init__.py


+ 87 - 0
quanguo/commodity_trade.py

@@ -0,0 +1,87 @@
+import re
+import openpyxl
+from decimal import Decimal
+import pandas as pd
+
+from quanguo.detail import parse_hscode
+
+
+def chinese_class_to_number(class_str):
+    """精确匹配中文类名"""
+    cn_num_map = {
+        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
+        '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16,
+        '十七': 17, '十八': 18, '十九': 19, '二十': 20, '二十一': 21, '二十二': 22
+    }
+    match = re.match(r'^第([一二三四五六七八九十百千万]+)类$', class_str)
+    return cn_num_map.get(match.group(1), 0) if match else 0
+
+
+def parse_value(val):
+    """数值处理(包含科学计数法处理)"""
+    if val in ('-', None, 'None', 'null'):
+        return 'NULL'
+    try:
+        # 处理科学计数法(如1.2E+5)
+        if 'E' in str(val).upper():
+            return Decimal(f"{float(val):.2f}").quantize(Decimal('0.00'))
+        return Decimal(str(val).replace(',', '')).quantize(Decimal('0.00'))
+    except Exception as e:
+        print(f"数值解析错误:{val},错误:{e}")
+        return 'NULL'
+
+
+# Data preparation: the source .xls is read with pandas, re-saved as .xlsx
+# and then loaded with openpyxl.
+
+df = pd.read_excel(r"D:/Downloads/2025051809313028789.xls", engine="xlrd")
+df.to_excel("converted.xlsx", index=False)
+wb = openpyxl.load_workbook('converted.xlsx')
+sheet = wb.active
+
+current_class = None
+year_month = '2025-04'
+sql_scripts = []
+
+# Walk all data rows (starting at sheet row 7 of the converted workbook)
+for row_idx, row in enumerate(sheet.iter_rows(min_row=7, values_only=True), start=7):
+    if not row[1]: continue
+
+    hs_code = parse_hscode(row[1])
+    if not hs_code:
+        print(f"[行{row_idx}] 忽略无效行:{row[1]}")
+        continue
+
+    # Read the six amount columns of the section/chapter row
+    try:
+        monthly_export = parse_value(row[2])
+        ytd_export = parse_value(row[3])
+        monthly_import = parse_value(row[4])
+        ytd_import = parse_value(row[5])
+        ytd_yoy_export = parse_value(row[6])
+        ytd_yoy_import = parse_value(row[7])
+    except IndexError as e:
+        print(f"行{row_idx} 列索引错误:{e}")
+        continue
+
+    # One upsert statement per row; parse_value() yields either a Decimal or
+    # the bare word NULL, so the amounts are interpolated without quotes.
+    sql = f"""INSERT INTO `t_yujin_crossborder_commodity_trade` 
+    (`year_month`, `hs_code`, `monthly_import`, `monthly_export`, 
+    `ytd_import`, `ytd_export`, `ytd_yoy_import`, `ytd_yoy_export`)
+    VALUES ('{year_month}', '{hs_code}', 
+    {monthly_import}, {monthly_export}, 
+    {ytd_import}, {ytd_export}, 
+    {ytd_yoy_import}, {ytd_yoy_export})
+    ON DUPLICATE KEY UPDATE 
+    monthly_import = VALUES(monthly_import),
+    monthly_export = VALUES(monthly_export),
+    ytd_import = VALUES(ytd_import),
+    ytd_export = VALUES(ytd_export),
+    ytd_yoy_import = VALUES(ytd_yoy_import),
+    ytd_yoy_export = VALUES(ytd_yoy_export);"""
+    sql_scripts.append(sql)
+    continue
+
+# Write all statements to the output script
+with open('../downloads/commodity_trade04.sql', 'w', encoding='utf-8') as f:
+    f.write('\n'.join(sql_scripts))
+
+print(f"生成完成,共处理 {len(sql_scripts)} 条有效记录")

+ 193 - 0
quanguo/commodity_trade_year.py

@@ -0,0 +1,193 @@
+import os
+import re
+import pymysql
+from decimal import Decimal
+from pymysql import Error
+import xlrd
+
+# 数据库配置
+DB_CONFIG = {
+    'host': '10.130.75.149',
+    'port': 3307,
+    'user': 'yto_crm',
+    'password': '%3sFUlsolaRI',
+    'database': 'crm_uat',
+    'charset': 'utf8mb4'
+}
+
+# 全局参数
+YEAR = 2023  # 目标年份
+BASE_DIR = "../src/downloads"  # 下载目录基础路径
+
+
+def chinese_class_to_number(class_str):
+    """中文类名转数字(保持原有实现)"""
+    cn_num_map = {
+        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6,
+        '七': 7, '八': 8, '九': 9, '十': 10, '十一': 11, '十二': 12,
+        '十三': 13, '十四': 14, '十五': 15, '十六': 16, '十七': 17,
+        '十八': 18, '十九': 19, '二十': 20, '二十一': 21, '二十二': 22
+    }
+    match = re.match(r'^第([一二三四五六七八九十百千万]+)类.*$', class_str)
+    return cn_num_map.get(match.group(1), 0) if match else 0
+
+
+def parse_value(val):
+    """增强型数值解析(含科学计数法处理),保留四位小数"""
+    if val in ('-', None, 'None', 'null'):
+        return None
+    try:
+        # 科学计数法处理(如1.2E+5),使用Decimal处理以避免浮动精度问题
+        if 'E' in str(val).upper():
+            return Decimal(val).quantize(Decimal('0.0000'))  # 用Decimal处理科学计数法,确保四位小数
+        return Decimal(str(val).replace(',', '')).quantize(Decimal('0.0000'))  # 保留四位小数
+    except Exception as e:
+        print(f"数值解析错误:{val},错误:{e}")
+        return None
+
+
+def process_month_file(conn, file_path, year_month):
+    """处理单个月份文件"""
+    try:
+        workbook = xlrd.open_workbook(file_path)
+        sheet = workbook.sheet_by_index(0)
+    except Exception as e:
+        print(f"文件打开失败:{file_path}\n错误:{str(e)}")
+        return 0
+
+    cursor = conn.cursor()
+    params = []
+    current_class = None
+
+    # 遍历数据行(从第7行开始)
+    for row_idx in range(6, sheet.nrows):
+        try:
+            row = sheet.row_values(row_idx)
+            cell_value = str(row[1]).strip() if len(row) > 1 else ""
+
+            # 数据清洗
+            clean_value = re.sub(r'\s+', ' ', cell_value)
+            parts = clean_value.split(' ', 1)
+
+            if not parts:
+                continue
+
+            identifier = parts[0]
+
+            # 解析章数据
+            if re.match(r'^\d+章$', identifier):
+                if current_class:
+                    chapter_num = re.findall(r'\d+', identifier)[0].zfill(2)
+                    hs_code = f"{current_class}{chapter_num}"
+
+                    # 提取各列数据(根据实际表格结构调整)
+                    data_fields = [
+                        parse_value(row[2]),  # monthly_export
+                        parse_value(row[3]),  # ytd_export
+                        parse_value(row[4]),  # monthly_import
+                        parse_value(row[5]),  # ytd_import
+                        parse_value(row[6]),  # ytd_yoy_export
+                        parse_value(row[7])  # ytd_yoy_import
+                    ]
+
+                    params.append((
+                        year_month,
+                        hs_code,
+                        *data_fields
+                    ))
+
+            # 解析类数据
+            elif re.match(r'^第([一二三四五六七八九十百千万]+)类.*$', identifier):
+                class_num = chinese_class_to_number(identifier)
+                if 1 <= class_num <= 22:
+                    current_class = f"{class_num:02d}"
+
+                    # 类级别数据插入
+                    class_data = [
+                        parse_value(row[2]),  # monthly_export
+                        parse_value(row[3]),  # ytd_export
+                        parse_value(row[4]),  # monthly_import
+                        parse_value(row[5]),  # ytd_import
+                        parse_value(row[6]),  # ytd_yoy_export
+                        parse_value(row[7])  # ytd_yoy_import
+                    ]
+
+                    params.append((
+                        year_month,
+                        current_class,
+                        *class_data
+                    ))
+
+        except Exception as e:
+            print(f"行{row_idx}处理失败:{str(e)}")
+            continue
+
+    # 批量执行SQL[3,9](@ref)
+    sql = """
+          INSERT INTO `t_yujin_crossborder_commodity_trade`
+          (`year_month`, `hs_code`, `monthly_export`, `ytd_export`,
+           `monthly_import`, `ytd_import`, `ytd_yoy_export`, `ytd_yoy_import`)
+          VALUES (%s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY \
+          UPDATE \
+              monthly_export = \
+          VALUES (monthly_export), ytd_export = \
+          VALUES (ytd_export), monthly_import = \
+          VALUES (monthly_import), ytd_import = \
+          VALUES (ytd_import), ytd_yoy_export = \
+          VALUES (ytd_yoy_export), ytd_yoy_import = \
+          VALUES (ytd_yoy_import) \
+          """
+
+    try:
+        # 分批次提交(每批1000条)[9](@ref)
+        batch_size = 1000
+        for i in range(0, len(params), batch_size):
+            cursor.executemany(sql, params[i:i + batch_size])
+            conn.commit()
+        return len(params)
+    except Error as e:
+        conn.rollback()
+        print(f"数据库操作失败:{str(e)}")
+        return 0
+    finally:
+        cursor.close()
+
+
+def main():
+    """主处理流程"""
+    conn = None
+    try:
+        conn = pymysql.connect(**DB_CONFIG)
+
+        # 顺序处理1-12月数据[6](@ref)
+        for month in range(1, 13):
+            # 构建文件路径
+            folder = f"{BASE_DIR}/{YEAR}/{month:02d}月"
+            file_name = f"(4){YEAR}年进出口商品类章总值表.xls"
+            file_path = os.path.join(folder, file_name)
+            file_path = os.path.normpath(file_path)
+            if not os.path.exists(file_path):
+                print(f"文件不存在:{file_path}")
+                continue
+
+            # 生成年月标识
+            year_month = f"{YEAR}-{month:02d}"
+            print(f"⌛ 正在处理 {year_month} 数据...")
+
+            try:
+                count = process_month_file(conn, file_path, year_month)
+                print(f"✅ 成功更新 {year_month},影响 {count} 条记录")
+            except Exception as e:
+                print(f"❌ {year_month} 处理失败:{str(e)}")
+                conn.rollback()
+
+    except Error as e:
+        print(f"数据库连接失败:{str(e)}")
+    finally:
+        if conn and conn.open:
+            conn.close()
+        print("🏁 所有月份处理完成")
+
+
+if __name__ == "__main__":
+    main()

BIN
quanguo/converted.xlsx


BIN
quanguo/converted_export.xlsx


BIN
quanguo/converted_import.xlsx


+ 154 - 0
quanguo/customs_data.json

@@ -0,0 +1,154 @@
+[
+  {
+    "表名": "(1)2025年进出口商品总值表 A:年度表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6421961/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422330/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470424/index.html"
+    ]
+  },
+  {
+    "表名": "(1)2025年进出口商品总值表 B:月度表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6421990/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422344/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470388/index.html"
+    ]
+  },
+  {
+    "表名": "(2)2025年进出口商品国别(地区)总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422001/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422365/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470374/index.html"
+    ]
+  },
+  {
+    "表名": "(3)2025年进出口商品构成表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422019/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422378/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470359/index.html"
+    ]
+  },
+  {
+    "表名": "(4)2025年进出口商品类章总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422031/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422434/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470352/index.html"
+    ]
+  },
+  {
+    "表名": "(5)2025年进出口商品贸易方式总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422038/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422482/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470327/index.html"
+    ]
+  },
+  {
+    "表名": "(6)2025年出口商品贸易方式企业性质总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422064/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422515/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470318/index.html"
+    ]
+  },
+  {
+    "表名": "(7)2025年进口商品贸易方式企业性质总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422090/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422527/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470306/index.html"
+    ]
+  },
+  {
+    "表名": "(8)2025年进出口商品收发货人所在地总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422418/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422542/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470287/index.html"
+    ]
+  },
+  {
+    "表名": "(9)2025年进出口商品境内目的地/货源地总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422112/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422575/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470230/index.html"
+    ]
+  },
+  {
+    "表名": "(10)2025年进出口商品关别总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422174/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422588/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470222/index.html"
+    ]
+  },
+  {
+    "表名": "(11)2025年特定地区进出口总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422201/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422606/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470210/index.html"
+    ]
+  },
+  {
+    "表名": "(12)2025年外商投资企业进出口总值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422221/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422624/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470202/index.html"
+    ]
+  },
+  {
+    "表名": "(13)2025年出口主要商品量值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422711/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422725/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470442/index.html"
+    ]
+  },
+  {
+    "表名": "(14)2025年进口主要商品量值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422284/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422637/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470190/index.html"
+    ]
+  },
+  {
+    "表名": "(15)2025年对部分国家(地区)出口商品类章金额表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422297/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422641/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470153/index.html"
+    ]
+  },
+  {
+    "表名": "(16)2025年自部分国家(地区)进口商品类章金额表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422306/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422666/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470132/index.html"
+    ]
+  },
+  {
+    "表名": "(17)2025年部分出口商品主要贸易方式量值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422317/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422678/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470117/index.html"
+    ]
+  },
+  {
+    "表名": "(18)2025年部分进口商品主要贸易方式量值表",
+    "月份链接": [
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422321/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6422692/index.html",
+      "/customs/302249/zfxxgk/2799825/302274/302277/302276/6470101/index.html"
+    ]
+  }
+]

+ 166 - 0
quanguo/detail.py

@@ -0,0 +1,166 @@
+# ---------------------------- 核心解析逻辑修改 ----------------------------
+from quanguo.CountryTradeYear import COUNTRY_CODE_MAPPING
+from utils.parse_utils import clean_county_name
+
+
+def chinese_class_to_number(class_str):
+    """精确匹配中文类名"""
+    cn_num_map = {
+        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
+        '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16,
+        '十七': 17, '十八': 18, '十九': 19, '二十': 20, '二十一': 21, '二十二': 22
+    }
+    match = re.match(r'^第([一二三四五六七八九十百千万]+)类$', class_str)
+    return cn_num_map.get(match.group(1), 0) if match else 0
+
+
+def parse_hscode(row_value):
+    """增强版HS编码解析,支持更多格式"""
+    global current_class
+
+    # 清洗输入值(新增删除中文括号内容)
+    cell_value = str(row_value).strip()
+    cell_value = re.sub(r'\s+', ' ', cell_value)  # 统一空格
+    cell_value = re.sub(r'[((].*[))]', '', cell_value)  # 删除括号内容
+    if not cell_value:
+        return None
+
+    # 匹配中文类名(如 “第一类 动物”)
+    class_cn_match = re.match(r'^第([一二三四五六七八九十百千万]+)类', cell_value)
+    if class_cn_match:
+        class_num = chinese_class_to_number(f"第{class_cn_match.group(1)}类")
+        current_class = f"{class_num:02d}"
+        return current_class
+
+    # 匹配数字类名(如 "第5类 矿产品")
+    class_digit_match = re.match(r'^第\s*(\d+)\s*类', cell_value)
+    if class_digit_match:
+        class_num = class_digit_match.group(1)
+        current_class = f"{int(class_num):02d}"
+        return current_class
+
+    # 匹配独立章名(如 "01章 活动物" 或 "第一章 工业制成品")
+    chapter_match = re.match(r'^(?:第)?\s*([一二三四五六七八九十百千万]+|\d{1,2})\s*章', cell_value)
+    if chapter_match:
+        chapter_str = chapter_match.group(1)
+
+        if not chapter_str.isdigit():
+            chapter_digit = chinese_class_to_number(chapter_str + "章")
+        else:
+            chapter_digit = int(chapter_str)
+
+        if current_class:
+            return f"{current_class}{chapter_digit:02d}"
+        else:
+            return f"{chapter_digit:02d}"
+
+    return None
+
+
+
+
+# ---------------------------- 完整代码集成 ----------------------------
+import re
+import openpyxl
+from decimal import Decimal
+import pandas as pd
+
+
+
+# ---------------------------- Global configuration ----------------------------
+# Two-digit section code of the most recently parsed section row; written and
+# read by parse_hscode() to prefix chapter codes.
+current_class = None
+# Period label written into every generated SQL statement.
+year_month = '2025-04'
+
+def parse_value(value):
+    """增强型数值解析,处理空值和特殊符号"""
+    try:
+        cleaned = str(value).strip().replace(',', '')
+        # 处理空值和"-"
+        if cleaned in ('', '-', 'NA', 'None'):
+            return 0  # 关键修改:将"-"转换为0
+        return Decimal(cleaned).quantize(Decimal('0.0000'))
+    except Exception:
+        return 0  # 所有异常情况返回0
+
+def generate_country_sql(file_path, trade_type):
+    """生成国家维度SQL脚本(支持数字类名)"""
+    df = pd.read_excel(file_path, engine='xlrd')
+    converted_path = f"converted_{trade_type}.xlsx"
+    df.to_excel(converted_path, index=False)
+
+    wb = openpyxl.load_workbook(converted_path)
+    sheet = wb.active
+    sql_scripts = []
+    global current_class
+
+    # 遍历行数据(从第7行开始)
+    for row_idx, row in enumerate(sheet.iter_rows(min_row=7, values_only=True), start=7):
+        if not row[1]:
+            continue
+
+        # 解析HS编码(核心修改)
+        hs_code = parse_hscode(row[1])
+        if not hs_code:
+            print(f"[行{row_idx}] 忽略无效行:{row[1]}")
+            continue
+
+        # 国家数据处理(保持原逻辑)
+        country_data = []
+        for col_idx in range(2, len(row), 2):
+            # 获取国家名称(假设国家名在表头第4行)
+            country_name = sheet.cell(row=4, column=col_idx + 1).value
+            if not country_name:
+                continue
+
+            country_name = clean_county_name(country_name)
+            country_code = COUNTRY_CODE_MAPPING.get(country_name, 'XX')
+            if  country_code == 'XX':
+                # print(f"[行{row_idx}] 忽略未知国家:{country_name}")
+                continue
+
+            # 关键修改:读取国家对应的month和cumulative列
+            month_val = parse_value(row[col_idx])
+            cumulative_val = parse_value(row[col_idx + 1]) if col_idx + 1 < len(row) else 0
+
+            country_data.append({
+                'country_code': country_code,
+                'country_name': country_name,
+                'month': month_val,
+                'cumulative': cumulative_val  # 添加累计值
+            })
+
+        # 生成SQL(新增数字类兼容)
+        if country_data:
+            try:
+                for country in country_data:
+                    # 使用国家专属的month/cumulative值(关键修改)
+                    sql = f"""INSERT INTO `t_yujin_crossborder_commodity_country`
+                    (`year_month`,`country_code`,`country_name`,`hs_code`,`trade_type`,
+                    `month_amount`,`cumulative_amount`)
+                    VALUES (
+                        '{year_month}',
+                        '{country['country_code']}',
+                        '{country['country_name']}',
+                        '{hs_code}',
+                        '{trade_type}',
+                        {country['month']},  # 使用国家维度数据
+                        {country['cumulative']}  # 使用国家维度数据
+                    ) ON DUPLICATE KEY UPDATE
+                        month_amount = VALUES(month_amount),
+                        cumulative_amount = VALUES(cumulative_amount);"""
+                    sql_scripts.append(sql)
+            except Exception as e:
+                print(f"[行{row_idx}] 数据异常:{str(e)}")
+
+    return sql_scripts
+
+
+# 执行入口保持不变
+if __name__ == "__main__":
+    export_sql = generate_country_sql(r"C:/Users/admin/PyCharmMiscProject/cross/2025051809572148978.xls", "export")
+    import_sql = generate_country_sql(r"C:/Users/admin/PyCharmMiscProject/cross/2025051809593876500.xls", "import")
+
+    with open('../downloads/commodity_country04.sql', 'w', encoding='utf-8') as f:
+        f.write('\n'.join(export_sql + import_sql))
+
+    print(f"生成完成,出口{len(export_sql)}条,进口{len(import_sql)}条")

+ 167 - 0
quanguo/detail_year.py

@@ -0,0 +1,167 @@
+# ---------------------------- 核心解析逻辑修改 ----------------------------
+import os
+import re
+from decimal import Decimal
+import xlrd
+import pymysql
+from pymysql import Error
+
+from quanguo.CountryTrade import COUNTRY_CODE_MAPPING
+
+DB_CONFIG = {
+    'host': '10.130.75.149',
+    'port': 3307,
+    'user': 'yto_crm',
+    'password': '%3sFUlsolaRI',
+    'database': 'crm_uat',
+    'charset': 'utf8mb4'
+}
+
+# 全局参数
+YEAR = 2023  # 目标年份
+BATCH_SIZE = 1000  # 每批插入量
+
+
+def parse_value(value):
+    """增强型数值解析,处理空值和特殊符号"""
+    try:
+        cleaned = str(value).strip().replace(',', '')
+        if cleaned in ('', '-', 'NA', 'None'):
+            return Decimal('0.0000')
+        return Decimal(cleaned).quantize(Decimal('0.0000'))
+    except Exception:
+        return Decimal('0.0000')
+
+
+def parse_hscode(row_value):
+    """解析类/章信息(增强格式兼容性)"""
+    global current_class
+
+    cell_value = str(row_value).strip()
+    clean_value = re.sub(r'\s+', ' ', cell_value)  # 合并连续空格
+
+    # 匹配类格式:第X类(兼容空格)
+    if class_match := re.match(r'^第\s*(\d+)\s*类\b.*', clean_value):
+        class_num = class_match.group(1)
+        current_class = f"{int(class_num):02d}"  # 两位数格式化
+        return current_class
+
+    # 匹配章格式:XX章(兼容前导零)
+    if chapter_match := re.match(r'^(\d+)\s*章\b.*', clean_value):
+        if current_class:
+            chapter_num = chapter_match.group(1).zfill(2)  # 自动补零
+            return f"{current_class}{chapter_num}"
+
+    return None
+
+def process_month_file(conn, folder, month, trade_type):
+    """处理单月数据文件"""
+    global current_class
+    current_class = None
+    # 构建文件路径
+    file_prefix = "(15)" if trade_type == "export" else "(16)"
+    file_name = f"{file_prefix}{YEAR}年{'自' if trade_type == 'import' else '对'}部分国家(地区){'进' if trade_type == 'import' else '出'}口商品类章金额表.xls"
+    file_path = os.path.join(folder, f"{month:02d}月", file_name)
+    file_path = os.path.normpath(file_path)
+    if not os.path.exists(file_path):
+        print(f"文件不存在:{file_path}")
+        return 0
+
+    try:
+        workbook = xlrd.open_workbook(file_path)
+        sheet = workbook.sheet_by_index(0)
+    except Exception as e:
+        print(f"文件打开失败:{file_path}\n错误:{str(e)}")
+        return 0
+
+    params = []
+    year_month = f"{YEAR}-{month:02d}"
+
+    # 遍历数据行(从第7行开始)
+    for row_idx in range(6, sheet.nrows):
+        try:
+            row = sheet.row_values(row_idx)
+            if not row[1]:
+                continue
+
+            # 解析HS编码
+            hs_code = parse_hscode(row[1])
+            if not hs_code:
+                print(f"[行{row_idx}] 忽略无效行:{row[1]}")
+                continue
+
+            # 读取国家数据
+            for col_idx in range(2, sheet.ncols, 2):
+                country_name = sheet.cell_value(3, col_idx).strip()
+                country_code = COUNTRY_CODE_MAPPING.get(country_name, 'XX')
+
+                month_val = parse_value(row[col_idx])
+                cumulative_val = parse_value(row[col_idx + 1]) if (col_idx + 1) < sheet.ncols else Decimal('0.0000')
+
+                params.append((
+                    year_month,
+                    hs_code,
+                    trade_type,
+                    country_code,
+                    country_name,
+                    month_val,
+                    cumulative_val
+                ))
+
+        except Exception as e:
+            print(f"行{row_idx}-{col_idx}处理失败:{str(e)}")
+            continue
+
+    # 批量写入数据库
+    try:
+        with conn.cursor() as cursor:
+            # 分批次提交[5,9](@ref)
+            for i in range(0, len(params), BATCH_SIZE):
+                batch = params[i:i + BATCH_SIZE]
+                sql = """
+                      INSERT INTO t_yujin_crossborder_commodity_country
+                      (`year_month`, hs_code, trade_type, country_code, country_name,
+                       month_amount, cumulative_amount)
+                      VALUES (%s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY \
+                      UPDATE \
+                          month_amount = \
+                      VALUES (month_amount), cumulative_amount = \
+                      VALUES (cumulative_amount) \
+                      """
+                cursor.executemany(sql, batch)
+                conn.commit()
+            return len(params)
+    except Error as e:
+        conn.rollback()
+        print(f"数据库操作失败:{str(e)}")
+        return 0
+
+
+def main():
+    """主处理流程"""
+    conn = None
+    try:
+        conn = pymysql.connect(**DB_CONFIG)
+
+        # 处理全年数据
+        for month in range(1, 13):
+            folder = f"downloads/{YEAR}"
+            print(f"⌛ 正在处理 {YEAR}-{month:02d} 数据...")
+
+            # 处理出口数据
+            export_count = process_month_file(conn, folder, month, "export")
+            # 处理进口数据
+            import_count = process_month_file(conn, folder, month, "import")
+
+            print(f"✅ {YEAR}-{month:02d} 完成 | 出口:{export_count}条 进口:{import_count}条")
+
+    except Error as e:
+        print(f"数据库连接失败:{str(e)}")
+    finally:
+        if conn and conn.open:
+            conn.close()
+        print("🏁 所有月份处理完成")
+
+
+if __name__ == "__main__":
+    main()

+ 126 - 0
quanguo/monthData.py

@@ -0,0 +1,126 @@
+import re
+
+import xlrd
+
OUTPUT_SQL = "../downloads/trade_monthly04.sql"  # path of the generated SQL script


def is_2025_data(date_str):
    """Return True when *date_str* is a 2025 period label such as ``"2025.04"``.

    Values that are not text (numeric cells, ``None``, bytes) are reported as
    False instead of raising.
    """
    try:
        return date_str.startswith('2025.')
    except (AttributeError, TypeError):
        # Non-string values either lack ``startswith`` or reject a str prefix.
        return False
+
+
def convert_unit(value):
    """Scale an amount by 10000 (亿元 to 万元) and round it to 4 decimals.

    Args:
        value: A worksheet cell value; a number or numeric text.

    Returns:
        The scaled ``float``, or ``None`` when the cell is a placeholder
        (``'-'`` or ``''``) or cannot be read as a number.
    """
    try:
        if value in ('-', ''):
            return None
        return round(float(value) * 10000, 4)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no value"; anything else should surface.
        return None
+
+
def parse_ratio(value):
    """Render a ratio cell as an SQL literal.

    Args:
        value: A worksheet cell value holding a year-over-year ratio.

    Returns:
        ``'NULL'`` for missing cells (``None``, ``'-'`` or ``''``); otherwise
        the value as a single-quoted SQL string literal.
    """
    if value is None or value in ('-', ''):
        return 'NULL'
    # The literal ends up in a generated SQL script, so a stray quote or
    # backslash must not be able to terminate it early. Backslash doubling
    # assumes MySQL's default (backslash-escaping) mode.
    text = str(value).replace('\\', '\\\\').replace("'", "''")
    return f"'{text}'"
+
+
def generate_sql(data_group):
    """Build the INSERT statement for one period of the monthly summary table.

    Args:
        data_group: Worksheet rows for one period. Row 0 is the base row:
            index 1 is the period label (``YYYY.MM``), indexes 2-5 the monthly
            total / export / import / balance and indexes 6-9 the same four
            year-to-date figures. When the group has exactly three rows,
            row 1 carries the year-over-year ratios at indexes 2-4
            (total / export / import).

    Returns:
        The SQL text of a single INSERT statement.
    """
    base_row = data_group[0]
    # str() keeps a numeric period cell from failing on ``.replace``.
    year_month = str(base_row[1]).replace('.', '-')

    def sql_amount(cell):
        # Only a missing amount becomes NULL; a genuine 0.0 must stay 0.0.
        amount = convert_unit(cell)
        return 'NULL' if amount is None else amount

    monthly_total = sql_amount(base_row[2])
    monthly_import = sql_amount(base_row[4])  # index 4 holds imports
    monthly_export = sql_amount(base_row[3])  # index 3 holds exports
    trade_balance = sql_amount(base_row[5])
    ytd_total = sql_amount(base_row[6])
    ytd_import = sql_amount(base_row[8])
    ytd_export = sql_amount(base_row[7])
    ytd_balance = sql_amount(base_row[9])

    # Year-over-year ratios default to NULL for single-row groups.
    yoy_data = {'import_export': 'NULL', 'import': 'NULL', 'export': 'NULL'}

    if len(data_group) == 3:
        # Same column order as the amounts: total, export, import.
        yoy_row = data_group[1]
        yoy_data = {
            'import_export': parse_ratio(yoy_row[2]),
            'import': parse_ratio(yoy_row[4]),
            'export': parse_ratio(yoy_row[3])
        }

    return f"""
INSERT INTO `t_yujin_crossborder_monthly_summary` 
(`year_month`, `monthly_total`, `monthly_import`, `monthly_export`, `trade_balance`,
`ytd_total`, `ytd_import`, `ytd_export`, `ytd_trade_balance`,
`yoy_import_export`, `yoy_import`, `yoy_export`, `create_time`)
VALUES (
    '{year_month}',
    {monthly_total},
    {monthly_import},
    {monthly_export},
    {trade_balance},
    {ytd_total},
    {ytd_import},
    {ytd_export},
    {ytd_balance},
    {yoy_data['import_export']},
    {yoy_data['import']},
    {yoy_data['export']},
    NOW()
);
"""
+
+
def main():
    """Convert the monthly summary workbook into an SQL script at ``OUTPUT_SQL``.

    Rows are read from index 5 onward. A row whose second cell does not look
    like ``202X.MM`` is skipped. 2025 periods span three worksheet rows
    (base row plus two follow-up rows); earlier periods span one. Each
    generated statement is written to the script and echoed to stdout, and a
    single summary line with the real statement count is printed at the end.
    """
    workbook = xlrd.open_workbook('D:/Downloads/2025051809220574256.xls')
    sheet = workbook.sheet_by_index(0)

    generated = 0
    row_idx = 5  # first data row
    # Opening once in 'w' mode both truncates an older script and avoids
    # reopening the file for every period.
    with open(OUTPUT_SQL, 'w', encoding='utf-8') as f:
        while row_idx < sheet.nrows:
            try:
                base_row = sheet.row_values(row_idx)
                date_cell = str(base_row[1])

                if not re.match(r"202[0-9]\.\d{2}", date_cell):  # not a period row
                    row_idx += 1
                    continue

                if is_2025_data(date_cell):
                    data_group = [
                        base_row,
                        sheet.row_values(row_idx + 1),
                        sheet.row_values(row_idx + 2)
                    ]
                    step = 3
                else:
                    data_group = [base_row]
                    step = 1

                sql = generate_sql(data_group)
                f.write(sql + "\n")
                generated += 1
                print(sql)
                row_idx += step

            except IndexError:
                # Raised when a 2025 group runs past the last worksheet row.
                print(f"行{row_idx}数据不完整,已跳过")
                row_idx += 1
            except Exception as e:
                print(f"处理行{row_idx}出错:{str(e)}")
                row_idx += 1

    # Report the number of statements, not the length of the last SQL string.
    print(f"成功生成 {generated} 条SQL语句,保存至:{OUTPUT_SQL}")


if __name__ == "__main__":
    main()

+ 111 - 0
quanguo/monthData2023.py

@@ -0,0 +1,111 @@
+import re
+from decimal import Decimal, InvalidOperation
+
+import xlrd
+import pymysql
+from datetime import datetime
+
# Keyword arguments handed to pymysql.connect(); adjust them per environment.
# NOTE(review): the account password is committed in plain text here -
# consider loading it from the environment or an untracked config file.
DB_CONFIG = dict(
    host='10.130.75.149',
    port=3307,
    user='yto_crm',
    password='%3sFUlsolaRI',
    database='crm_uat',
    charset='utf8mb4',
)
+
def convert_unit(value):
    """Scale an amount by 10000 (亿元 to 万元) and round it to 4 decimals.

    Args:
        value: A worksheet cell value; a number or numeric text.

    Returns:
        A ``Decimal`` with four decimal places, or ``None`` when the cell is
        a placeholder (``'-'`` or ``''``) or cannot be read as a number.
    """
    try:
        if value in ('-', ''):
            return None
        # Going through str() makes float cells use their short decimal
        # representation instead of the exact binary expansion, and turns
        # unsupported types into a parse failure rather than a TypeError.
        return round(Decimal(str(value)) * 10000, 4)
    except (InvalidOperation, TypeError, ValueError):
        return None
+
def get_upsert_sql():
    """Return the parameterised upsert statement for the monthly summary table.

    The statement takes ten ``%s`` placeholders in column order and, on a
    duplicate key, overwrites every non-key column with the incoming values.
    """
    statement = (
        "\n"
        "    INSERT INTO `t_yujin_crossborder_monthly_summary` \n"
        "    (`year_month`, `monthly_total`, `monthly_import`, `monthly_export`, `trade_balance`,\n"
        "     `ytd_total`, `ytd_import`, `ytd_export`, `ytd_trade_balance`, `create_time`)\n"
        "    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\n"
        "    ON DUPLICATE KEY UPDATE\n"
        "        monthly_total = VALUES(monthly_total),\n"
        "        monthly_import = VALUES(monthly_import),\n"
        "        monthly_export = VALUES(monthly_export),\n"
        "        trade_balance = VALUES(trade_balance),\n"
        "        ytd_total = VALUES(ytd_total),\n"
        "        ytd_import = VALUES(ytd_import),\n"
        "        ytd_export = VALUES(ytd_export),\n"
        "        ytd_trade_balance = VALUES(ytd_trade_balance),\n"
        "        create_time = VALUES(create_time)\n"
        "    "
    )
    return statement
+
def main():
    """Upsert the 2023 rows of the monthly summary workbook into MySQL.

    Rows are read from index 5 onward; only rows whose second cell matches
    ``2023.MM`` are loaded. A row that fails is reported and skipped, and
    everything that succeeded is committed together at the end. The cursor
    and connection are closed on every exit path.
    """
    try:
        conn = pymysql.connect(**DB_CONFIG)
        cursor = conn.cursor()
    except pymysql.Error as e:
        print(f"数据库连接失败: {e}")
        return

    try:
        try:
            workbook = xlrd.open_workbook('../src/downloads/20250513/2025011810224811354 (1).xls')
            sheet = workbook.sheet_by_index(0)
        except Exception as e:
            print(f"文件读取失败: {e}")
            return

        sql = get_upsert_sql()
        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        processed = 0

        for row_idx in range(5, sheet.nrows):
            try:
                base_row = sheet.row_values(row_idx)
                date_cell = str(base_row[1])

                if not re.match(r"^2023\.\d{2}$", date_cell):
                    continue

                # Sheet order is total, export, import, balance - for both the
                # monthly block (indexes 2-5) and the year-to-date block (6-9).
                params = (
                    date_cell.replace('.', '-'),     # year_month
                    convert_unit(base_row[2]),       # monthly_total
                    convert_unit(base_row[4]),       # monthly_import
                    convert_unit(base_row[3]),       # monthly_export
                    convert_unit(base_row[5]),       # trade_balance
                    convert_unit(base_row[6]),       # ytd_total
                    convert_unit(base_row[8]),       # ytd_import
                    convert_unit(base_row[7]),       # ytd_export
                    convert_unit(base_row[9]),       # ytd_trade_balance
                    current_time                     # create_time
                )

                cursor.execute(sql, params)
                processed += 1

            except Exception as e:
                # Skip just this row. Rolling back here would silently discard
                # every earlier row of the still-open transaction while
                # ``processed`` kept counting them.
                print(f"处理行{row_idx}出错: {str(e)}")

        try:
            conn.commit()
            print(f"成功处理 {processed} 条数据,时间:{current_time}")
        except pymysql.Error as e:
            print(f"事务提交失败: {e}")
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    main()

+ 90 - 0
quanguo/pc.py

@@ -0,0 +1,90 @@
+import os
+import time
+import requests
+import schedule
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+
+
def download_excel(url, save_path):
    """Download *url* and write the response body to *save_path*.

    Args:
        url: Address of the workbook to fetch.
        save_path: Destination file path; an existing file is overwritten.

    Raises:
        requests.RequestException: On connection problems, a timeout, or an
            HTTP error status (so an error page is never saved as a workbook).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.google.com/'  # present a plausible referring page
    }
    # stream=True keeps the body out of memory until it is written chunk by
    # chunk; the context manager releases the connection afterwards.
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
+
+
def find_and_download_monthly_data():
    """Download every monthly workbook linked from the customs statistics page.

    Steps: fetch the index page, find the table row whose text contains the
    ``{year}年进出口商品收发货人所在地总值表`` title, collect the month links in that
    row, open each link in Chrome to read its download address, and save the
    file under ``../src/downloads`` as ``{year}{month:02d}_海关数据.xlsx``.
    Any error is printed rather than raised, and the browser is always closed.
    """
    import re  # function-scope import: this module has no file-level ``import re``

    year = 2025
    download_root = '../src/downloads'
    driver = None
    try:
        # 1. Fetch the index page.
        main_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Encoding": "gzip, deflate",  # the response must be decompressed accordingly
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "AV7KYchI7HHaS=5sC6lIXRxEGXW6dT63ZBwGHY4pma1LIP4nuaP5fqUi7S8d7D3nolW7IA9MoTWDQ8S8Pi6.uGvZmBHNYlJsClRVa;...",
            # paste the complete cookie string copied from a browser session
            "Host": "www.customs.gov.cn",
            "Referer": "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0"
        }
        response = requests.get(main_url, headers=headers, timeout=30)
        print(f"res...{response}")
        soup = BeautifulSoup(response.text, 'html.parser')

        # 2. Locate the row of the target table.
        target_title = f'{year}年进出口商品收发货人所在地总值表'
        target_row = next(
            (row for row in soup.select('tr') if target_title in row.text), None)
        if target_row is None:
            print("未找到目标表格行")
            return

        # 3. Collect (month, href) pairs; the month number names the file.
        month_links = []
        for cell in target_row.find_all('a', href=True):
            number = re.search(r'\d+', cell.text)
            if number and 1 <= int(number.group()) <= 12:
                month_links.append((int(number.group()), cell['href']))

        # 4. The detail pages are dynamic, so read them through Selenium.
        driver = webdriver.Chrome()
        for month, link in month_links:
            full_url = requests.compat.urljoin(main_url, link)
            driver.get(full_url)

            download_btn = driver.find_element(By.XPATH, '//a[contains(text(),"下载")]')
            excel_url = download_btn.get_attribute('href')

            # One file per data month; naming by the current date made every
            # month overwrite the same file.
            filename = f"{year}{month:02d}_海关数据.xlsx"
            download_excel(excel_url, os.path.join(download_root, filename))

    except Exception as e:
        print(f"发生错误: {str(e)}")
    finally:
        if driver is not None:
            driver.quit()
+
+
# Register the download job for 09:00 every day.
# NOTE(review): nothing visible in this script calls schedule.run_pending(),
# so the registration by itself does not trigger any run - confirm whether a
# runner loop is expected elsewhere.
schedule.every().day.at("09:00").do(find_and_download_monthly_data)

if __name__ == "__main__":
    # Make sure the download folder exists before the one-off run below.
    download_root = '../src/downloads'
    if not os.path.exists(download_root):
        os.makedirs(download_root)

    find_and_download_monthly_data()

+ 140 - 0
quanguo/region_trade.py

@@ -0,0 +1,140 @@
+from decimal import Decimal
+
+import xlrd
+
# Province-level administrative divisions keyed by full name (GB/T 2260-2023).
# Every entry maps to {"code": <six-digit code>, "type": "province"}.
REGION_MAPPING = {
    name: {"code": code, "type": "province"}
    for name, code in (
        # Municipalities
        ("北京市", "110000"),
        ("天津市", "120000"),
        ("上海市", "310000"),
        ("重庆市", "500000"),

        # Provinces (23)
        ("河北省", "130000"),
        ("山西省", "140000"),
        ("辽宁省", "210000"),
        ("吉林省", "220000"),
        ("黑龙江省", "230000"),
        ("江苏省", "320000"),
        ("浙江省", "330000"),
        ("安徽省", "340000"),
        ("福建省", "350000"),
        ("江西省", "360000"),
        ("山东省", "370000"),
        ("河南省", "410000"),
        ("湖北省", "420000"),
        ("湖南省", "430000"),
        ("广东省", "440000"),
        ("海南省", "460000"),
        ("四川省", "510000"),
        ("贵州省", "520000"),
        ("云南省", "530000"),
        ("陕西省", "610000"),
        ("甘肃省", "620000"),
        ("青海省", "630000"),
        ("台湾省", "710000"),

        # Autonomous regions (5)
        ("内蒙古自治区", "150000"),
        ("广西壮族自治区", "450000"),
        ("西藏自治区", "540000"),
        ("宁夏回族自治区", "640000"),
        ("新疆维吾尔自治区", "650000"),

        # Special administrative regions
        ("香港特别行政区", "810000"),
        ("澳门特别行政区", "820000"),
    )
}
+
+
def parse_numeric(value):
    """Parse a worksheet cell into a ``Decimal`` with two decimal places.

    Thousands separators and surrounding whitespace are removed first.
    Placeholder cells (``'-'``, ``''``, ``'NA'``, ``'NaN'``) and anything that
    fails to parse yield ``Decimal('0.00')``; parse failures are also printed.
    """
    two_places = Decimal('0.00')
    try:
        text = str(value).strip().replace(',', '')
        is_placeholder = text in ('-', '', 'NA', 'NaN')
        return two_places if is_placeholder else Decimal(text).quantize(two_places)
    except Exception as e:
        print(f"数值解析失败:{value},错误:{str(e)}")
        return Decimal('0.00')
+
+
def generate_region_sql(input_file, output_file, year_month):
    """Write an SQL script with one upsert per region found in an .xls sheet.

    Args:
        input_file: Path of the workbook; its first sheet is read from row
            index 6 onward, with the region name in column index 1.
        output_file: Path of the SQL script to write (overwritten).
        year_month: Period label stored with every generated row.

    Returns:
        The number of statements written.
    """
    sheet = xlrd.open_workbook(input_file).sheet_by_index(0)
    statements = []

    for idx in range(6, sheet.nrows):
        cells = sheet.row_values(idx)
        region_name = str(cells[1]).strip() if len(cells) > 1 else ""
        if not region_name:
            continue

        # Exact name first; otherwise the first mapping key containing the
        # (possibly abbreviated) name.
        region_info = REGION_MAPPING.get(region_name)
        if not region_info:
            for full_name, info in REGION_MAPPING.items():
                if region_name in full_name:
                    region_info = info
                    break

        if not region_info:
            print(f"警告:未找到地区编码映射 - {region_name}")
            continue

        # Amounts are parsed in worksheet column order.
        monthly_total = parse_numeric(cells[2])
        monthly_export = parse_numeric(cells[4])
        monthly_import = parse_numeric(cells[6])
        ytd_total = parse_numeric(cells[8])
        ytd_export = parse_numeric(cells[9])
        ytd_import = parse_numeric(cells[10])

        statement = f"""
        INSERT INTO `t_yujin_crossborder_region_trade`
        (`year_month`, `region_code`, `region_name`, `region_type`,
         `monthly_total`, `monthly_import`, `monthly_export`,
         `ytd_total`, `ytd_import`, `ytd_export`)
        VALUES (
            '{year_month}',
            '{region_info['code']}',
            '{region_name}',
            '{region_info['type']}',
            {monthly_total},
            {monthly_import},
            {monthly_export},
            {ytd_total},
            {ytd_import},
            {ytd_export}
        ) ON DUPLICATE KEY UPDATE
            monthly_total = VALUES(monthly_total),
            monthly_import = VALUES(monthly_import),
            monthly_export = VALUES(monthly_export),
            ytd_total = VALUES(ytd_total),
            ytd_import = VALUES(ytd_import),
            ytd_export = VALUES(ytd_export);
        """
        statements.append(statement)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(statements))

    return len(statements)


# Example run (requires xlrd==1.2.0 to be installed beforehand)
if __name__ == "__main__":
    input_excel = r"D:/Downloads/2025051809411226553.xls"
    output_sql = "../downloads/region_trade04.sql"
    year_month = "2025-04"

    try:
        count = generate_region_sql(input_excel, output_sql, year_month)
    except Exception as e:
        print(f"生成失败:{str(e)}")
    else:
        print(f"成功生成{count}条SQL语句,已保存至{output_sql}")

+ 190 - 0
quanguo/region_trade2024.py

@@ -0,0 +1,190 @@
+import os
+from decimal import Decimal
+
+import pymysql
+import xlrd
+from pymysql import Error
+
# Data year processed by this script.
YEAR = 2025

# Keyword arguments handed to pymysql.connect().
# NOTE(review): the account password is committed in plain text here -
# consider loading it from the environment or an untracked config file.
DB_CONFIG = dict(
    host='10.130.75.149',
    port=3307,
    user='yto_crm',
    password='%3sFUlsolaRI',
    database='crm_uat',
    charset='utf8mb4',
)
+
# Province-level administrative divisions keyed by full name (GB/T 2260-2023).
# Every entry maps to {"code": <six-digit code>, "type": "province"}.
REGION_MAPPING = {
    name: {"code": code, "type": "province"}
    for name, code in (
        # Municipalities
        ("北京市", "110000"),
        ("天津市", "120000"),
        ("上海市", "310000"),
        ("重庆市", "500000"),

        # Provinces (23)
        ("河北省", "130000"),
        ("山西省", "140000"),
        ("辽宁省", "210000"),
        ("吉林省", "220000"),
        ("黑龙江省", "230000"),
        ("江苏省", "320000"),
        ("浙江省", "330000"),
        ("安徽省", "340000"),
        ("福建省", "350000"),
        ("江西省", "360000"),
        ("山东省", "370000"),
        ("河南省", "410000"),
        ("湖北省", "420000"),
        ("湖南省", "430000"),
        ("广东省", "440000"),
        ("海南省", "460000"),
        ("四川省", "510000"),
        ("贵州省", "520000"),
        ("云南省", "530000"),
        ("陕西省", "610000"),
        ("甘肃省", "620000"),
        ("青海省", "630000"),
        ("台湾省", "710000"),

        # Autonomous regions (5)
        ("内蒙古自治区", "150000"),
        ("广西壮族自治区", "450000"),
        ("西藏自治区", "540000"),
        ("宁夏回族自治区", "640000"),
        ("新疆维吾尔自治区", "650000"),

        # Special administrative regions
        ("香港特别行政区", "810000"),
        ("澳门特别行政区", "820000"),
    )
}
+
+
def parse_numeric(value):
    """Parse a worksheet cell into a ``Decimal`` with four decimal places.

    Whitespace and thousands separators are removed and an em dash is read as
    a minus sign. Placeholder cells (``'-'``, ``''``, ``'NA'``, ``'N/A'``) and
    anything that fails to parse yield ``Decimal('0.0000')``; parse failures
    are also printed.
    """
    four_places = Decimal('0.0000')
    try:
        text = str(value).strip()
        for old, new in ((',', ''), ('—', '-')):
            text = text.replace(old, new)
        if text in ('-', '', 'NA', 'N/A'):
            return four_places
        return Decimal(text).quantize(four_places)
    except Exception as e:
        print(f"数值解析失败:{value},错误:{str(e)}")
        return Decimal('0.0000')
+
+
def batch_upsert(conn, file_path, year_month):
    """Upsert one workbook's regional trade rows and commit them.

    Args:
        conn: Open database connection; must provide ``cursor()`` and
            ``commit()``.
        file_path: Path of the .xls workbook; its first sheet is read from
            row index 6 onward, with the region name in column index 1.
        year_month: Period label stored with every row.

    Returns:
        The number of rows executed successfully; 0 when the workbook cannot
        be opened.
    """
    try:
        workbook = xlrd.open_workbook(file_path)
        sheet = workbook.sheet_by_index(0)
    except Exception as e:
        print(f"文件读取失败:{file_path}\n错误:{str(e)}")
        return 0

    upsert_sql = """
                INSERT INTO `t_yujin_crossborder_region_trade`
                (`year_month`, `region_code`, `region_name`, `region_type`,
                 `monthly_total`,`monthly_export`, `monthly_import`, 
                 `ytd_total`, `ytd_export`, `ytd_import`)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY
                UPDATE
                    region_name =
                VALUES (region_name), region_type =
                VALUES (region_type), monthly_total =
                VALUES (monthly_total), monthly_import =
                VALUES (monthly_import), monthly_export =
                VALUES (monthly_export), ytd_total =
                VALUES (ytd_total), ytd_import =
                VALUES (ytd_import), ytd_export =
                VALUES (ytd_export)
                """

    cursor = conn.cursor()
    processed = 0
    try:
        # Data starts at row index 6.
        for row_idx in range(6, sheet.nrows):
            try:
                row = sheet.row_values(row_idx)
                region_name = str(row[1]).strip()

                # Skip blank rows and the header row.
                if not region_name or region_name == "地区":
                    continue

                # Exact name first, then the first mapping key that contains
                # the (possibly abbreviated) name. The fallback has to yield
                # the mapping entry itself.
                region_info = REGION_MAPPING.get(region_name) or next(
                    (info for full_name, info in REGION_MAPPING.items()
                     if region_name in full_name),
                    None)
                if not region_info:
                    continue
                print(f"处理地区 - {region_name}")

                # Parameter order follows the column list of the INSERT above.
                params = (
                    year_month,
                    region_info['code'],  # region_code
                    region_name,
                    region_info['type'],
                    parse_numeric(row[2]),  # monthly_total
                    parse_numeric(row[4]),  # monthly_export
                    parse_numeric(row[6]),  # monthly_import
                    parse_numeric(row[8]),  # ytd_total
                    parse_numeric(row[9]),  # ytd_export
                    parse_numeric(row[10])  # ytd_import
                )

                cursor.execute(upsert_sql, params)
                processed += 1
            except Exception as e:
                print(f"行{row_idx}处理失败:{str(e)}")
                continue

        conn.commit()
    finally:
        cursor.close()
    return processed
+
+
def main():
    """Load the regional trade workbook of every month of ``YEAR`` into MySQL.

    For months 1-12 the expected workbook path is built under
    ``downloads/{YEAR}/{MM}月``; missing files are reported and skipped. Each
    existing file is passed to ``batch_upsert``; a failure there rolls the
    connection back and processing continues with the next month.
    """
    # Bound before the try block so the ``finally`` clause can test it even
    # when connecting fails.
    conn = None
    try:
        conn = pymysql.connect(**DB_CONFIG)

        for month in range(1, 13):
            folder = f"downloads/{YEAR}/{month:02d}月"
            file_name = f"(8){YEAR}年进出口商品收发货人所在地总值表.xls"
            file_path = os.path.normpath(os.path.join(folder, file_name))

            if not os.path.isfile(file_path):
                print(f"文件不存在,跳过:{file_path}")
                continue

            year_month = f"{YEAR}-{month:02d}"

            try:
                count = batch_upsert(conn, file_path, year_month)
                print(f"✅ 成功更新 {year_month} 数据,影响{count}条记录")
            except Exception as e:
                print(f"❌ {year_month} 处理失败:{str(e)}")
                conn.rollback()

    except Error as e:
        print(f"数据库连接失败:{str(e)}")
    finally:
        if conn is not None and conn.open:
            conn.close()
        print("🏁 所有月份处理完成")


if __name__ == "__main__":
    main()

+ 107 - 0
quanguo/yearData.py

@@ -0,0 +1,107 @@
+from decimal import Decimal, InvalidOperation
+
+import xlrd
+import pymysql
+from datetime import datetime
+
# Keyword arguments handed to pymysql.connect(); adjust them per environment.
# NOTE(review): the account password is committed in plain text here -
# consider loading it from the environment or an untracked config file.
DB_CONFIG = dict(
    host='10.130.75.149',
    port=3307,
    user='yto_crm',
    password='%3sFUlsolaRI',
    database='crm_uat',
    charset='utf8mb4',
)
+
def convert_unit(value):
    """Scale an amount by 10000 (亿元 to 万元) and round it to 4 decimals.

    Args:
        value: A worksheet cell value; a number or numeric text.

    Returns:
        A ``Decimal`` with four decimal places, or ``None`` when the cell is
        a placeholder (``'-'`` or ``''``) or cannot be read as a number.
    """
    try:
        if value in ('-', ''):
            return None
        # Going through str() makes float cells use their short decimal
        # representation instead of the exact binary expansion, and turns
        # unsupported types into a parse failure rather than a TypeError.
        return round(Decimal(str(value)) * 10000, 4)
    except (InvalidOperation, TypeError, ValueError):
        return None
+
def parse_ratio(value):
    """Return a ratio cell unchanged, or ``None`` for a placeholder cell.

    ``'-'`` and the empty string are the placeholders.
    """
    if value in ['-', '']:
        return None
    return value
+
def get_upsert_sql():
    """Return the parameterised upsert statement for the yearly summary table.

    The statement takes nine ``%s`` placeholders in column order and, on a
    duplicate key, overwrites every non-key column with the incoming values.
    """
    statement = (
        "\n"
        "    INSERT INTO t_yujin_crossborder_yearly_summary \n"
        "    (year, year_total, year_import, year_export, trade_balance, \n"
        "     yoy_import_export, yoy_import, yoy_export, create_time)\n"
        "    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n"
        "    ON DUPLICATE KEY UPDATE\n"
        "        year_total = VALUES(year_total),\n"
        "        year_import = VALUES(year_import),\n"
        "        year_export = VALUES(year_export),\n"
        "        trade_balance = VALUES(trade_balance),\n"
        "        yoy_import_export = VALUES(yoy_import_export),\n"
        "        yoy_import = VALUES(yoy_import),\n"
        "        yoy_export = VALUES(yoy_export),\n"
        "        create_time = VALUES(create_time)\n"
        "    "
    )
    return statement
+
def main():
    """Upsert the yearly summary rows of the downloaded workbook into MySQL.

    Rows are read from index 5 onward; rows with an empty year cell are
    skipped. The first database error rolls the transaction back and stops
    the load without reporting success. Otherwise all rows are committed
    together. The cursor and connection are closed on every exit path.
    """
    try:
        conn = pymysql.connect(**DB_CONFIG)
        cursor = conn.cursor()
    except pymysql.Error as e:
        print(f"数据库连接失败: {e}")
        return

    try:
        try:
            workbook = xlrd.open_workbook('../src/downloads/20250513/(1)2025年进出口商品总值表_A年度表_3月.xls')
            sheet = workbook.sheet_by_index(0)
        except Exception as e:
            print(f"文件读取失败: {e}")
            return

        sql = get_upsert_sql()
        current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        processed = 0

        for row_idx in range(5, sheet.nrows):
            row = sheet.row_values(row_idx)
            if not row[1]:  # no year in this row
                continue

            # Amounts sit in the order total, export, import, balance
            # (indexes 2-5).
            # NOTE(review): the ratio columns (6-8) are assumed to follow the
            # same total / export / import order - confirm against the
            # workbook header.
            params = (
                row[1],                # year
                convert_unit(row[2]),  # year_total
                convert_unit(row[4]),  # year_import
                convert_unit(row[3]),  # year_export
                convert_unit(row[5]),  # trade_balance
                parse_ratio(row[6]),   # yoy_import_export
                parse_ratio(row[8]),   # yoy_import
                parse_ratio(row[7]),   # yoy_export
                current_time           # create_time
            )

            try:
                cursor.execute(sql, params)
                processed += 1
            except pymysql.Error as e:
                print(f"数据插入失败(行{row_idx}): {e}")
                conn.rollback()
                # Nothing is left to commit; do not fall through to the
                # success message with a stale row count.
                return

        try:
            conn.commit()
            print(f"成功处理 {processed} 条数据,当前时间:{current_time}")
        except pymysql.Error as e:
            print(f"事务提交失败: {e}")
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    main()
+ 231 - 0
selenium_download.py

@@ -0,0 +1,231 @@
+import os
+import re
+import time
+from pathlib import Path
+
+from faker import Faker
+from selenium import webdriver
+from selenium.common import StaleElementReferenceException
+from selenium.webdriver import FirefoxOptions, ActionChains
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
# Data year whose tables are downloaded.
YEAR = 2025

# Exact first-column titles of the tables to download, built from the
# table's list number and its title.
TARGET_TABLES = [
    f"({number}){YEAR}年{title}"
    for number, title in (
        (2, "进出口商品国别(地区)总值表"),
        (4, "进出口商品类章总值表"),
        (8, "进出口商品收发货人所在地总值表"),
        (15, "对部分国家(地区)出口商品类章金额表"),
        (16, "自部分国家(地区)进口商品类章金额表"),
    )
]
+
+
def wait_for_download_complete(download_dir, timeout=15, poll_interval=1.0):
    """Wait until a new, fully written file appears under *download_dir*.

    The directory tree is snapshotted on entry and polled until at least one
    new regular file exists and no new partial-download file is left.

    Args:
        download_dir: Directory to watch; subdirectories are included.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between two scans.

    Returns:
        The ``Path`` of the newest finished file.

    Raises:
        TimeoutError: When no finished download shows up within *timeout*.
    """
    root = Path(download_dir)
    # Temporary names browsers use while a download is still in progress.
    partial_suffixes = ('.part', '.crdownload', '.tmp')

    def snapshot():
        # Regular files only: a newly created folder is not a download.
        return {path for path in root.rglob('*') if path.is_file()}

    baseline = snapshot()
    deadline = time.time() + timeout

    while time.time() < deadline:
        fresh = snapshot() - baseline
        finished = [p for p in fresh if p.suffix.lower() not in partial_suffixes]
        still_downloading = len(finished) != len(fresh)
        if finished and not still_downloading:
            return max(finished, key=lambda f: f.stat().st_ctime)
        time.sleep(poll_interval)
    raise TimeoutError("文件下载超时")
+
+
+from selenium.webdriver.common.by import By
+
def process_table_row(row):
    """Extract the table title and its month links from one table row.

    Args:
        row: Selenium element for a ``<tr>``. The first cell holds the table
            title, the second the per-month ``<a>`` links.

    Returns:
        ``(table_name, month_links)`` where ``month_links`` is a list of
        ``(month, href)`` tuples sorted by month, or ``None`` when the row
        has fewer than two cells or cannot be read.
    """
    try:
        cells = row.find_elements(By.TAG_NAME, 'td')
        if len(cells) < 2:
            return None

        table_name = cells[0].text.strip()

        month_links = []
        for anchor in cells[1].find_elements(By.TAG_NAME, 'a'):
            # Accept only labels of the form "<number>月". Any other link is
            # skipped on its own instead of invalidating the whole row.
            label = re.fullmatch(r'\s*(\d{1,2})\s*月\s*', anchor.text)
            if label is None:
                continue
            href = anchor.get_attribute('href')
            if href:
                month_links.append((int(label.group(1)), href))

        # Ascending by month number.
        month_links.sort(key=lambda item: item[0])

        return (table_name, month_links)

    except Exception as e:
        print(f"表格行处理异常: {str(e)}")
        return None
+
+
+
def download_monthly_data(driver, table_name, month_data):
    """Download one month's workbook and file it under the month folder.

    Args:
        driver: Selenium WebDriver used to open the page and click the link.
        table_name: Table title; used (sanitised) as the target file name.
        month_data: ``(month_number, page_url)`` tuple.

    Returns:
        True when the file was downloaded and moved to
        ``downloads/{YEAR}/{MM}月/``, False on any failure (the failure is
        printed and a screenshot is attempted).
    """
    month_num, link = month_data
    # Drop characters that are not allowed in file names.
    safe_name = re.sub(r'[\\/*?:"<>|]', "", table_name).replace(' ', '_')

    try:
        download_dir = os.path.abspath(f"downloads/{YEAR}")

        driver.get(link)
        download_btn = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR,
                                            'span.easysite-isprase a[href$=".xls"], span.easysite-isprase a[href$=".xlsx"]'))
        )
        ActionChains(driver).move_to_element(download_btn).click().perform()

        downloaded_file = wait_for_download_complete(download_dir)

        target_dir = Path(download_dir) / f"{month_num:02d}月"
        target_dir.mkdir(parents=True, exist_ok=True)
        final_path = target_dir / f"{safe_name}{downloaded_file.suffix}"

        # replace() overwrites an existing file in one step, so no separate
        # delete-then-rename is needed.
        downloaded_file.replace(final_path)

        print(f"√ 成功下载:{final_path}")
        return True
    except Exception as e:
        print(f"× 下载失败 {table_name} {month_num}月:{str(e)}")
        try:
            driver.save_screenshot(f'error_{safe_name}_{month_num:02d}.png')
        except Exception as shot_error:
            # A failing screenshot must not turn the promised False into an
            # exception for the caller.
            print(f"截图保存失败:{shot_error}")
        return False
+
+
def configure_stealth_options():
    """Build the ``FirefoxOptions`` used by the crawler.

    Covers three areas: silent downloads into ``downloads/{YEAR}``,
    preferences meant to hide browser automation, and a randomised user
    agent with a fixed headless viewport.

    Returns:
        The configured ``FirefoxOptions`` instance.
    """
    opts = FirefoxOptions()
    download_dir = os.path.abspath(f"downloads/{YEAR}")

    # Download handling: save into download_dir without asking.
    opts.set_preference("browser.download.dir", download_dir)
    opts.set_preference("browser.download.folderList", 2)
    opts.set_preference("browser.download.manager.showWhenStarting", False)
    # The crawler clicks both .xls and .xlsx links, so the .xlsx MIME type
    # has to be listed as well.
    save_without_prompt = ", ".join((
        "application/octet-stream",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ))
    opts.set_preference("browser.helperApps.neverAsk.saveToDisk", save_without_prompt)

    # Anti-detection settings.
    # NOTE(review): the last two look like Chromium-specific switches -
    # confirm whether Firefox honours them.
    opts.set_preference("dom.webdriver.enabled", False)
    opts.set_preference("useAutomationExtension", False)
    opts.add_argument("--disable-blink-features=AutomationControlled")

    # Randomised browser fingerprint.
    fake = Faker()
    opts.set_preference("general.useragent.override", fake.firefox())
    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")

    # Viewport size and headless mode.
    opts.add_argument("--width=1440")
    opts.add_argument("--height=900")
    opts.add_argument("--headless")
    return opts
+
+
def crawl_with_selenium(url):
    """Walk the ``#yb{YEAR}RMB`` table on *url* and download the target tables.

    The table is consumed row by row: the first data row is inspected,
    downloaded when its title is listed in ``TARGET_TABLES``, and then removed
    from the DOM so the next iteration sees the following row. The loop ends
    when no data rows remain. The browser is always closed.

    Args:
        url: Address of the index page holding the table.
    """
    driver = webdriver.Firefox(options=configure_stealth_options())
    # Titles already downloaded. A page refresh brings every removed row
    # back, and finished tables must not be downloaded a second time.
    finished_tables = set()

    try:
        # Inject the anti-detection script.
        driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', { 
                get: () => undefined 
            });
            window.alert = () => {};
        """)

        driver.get(url)
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script("return document.readyState === 'complete'")
        )

        while True:
            # Query the rows afresh on every pass; earlier references go stale.
            try:
                table = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
                )
                current_rows = table.find_elements(By.CSS_SELECTOR, "tr:not(:first-child)")

                if not current_rows:
                    print("所有表格处理完成")
                    break

                # Handle only the current first row.
                row = current_rows[0]
                result = process_table_row(row)

                if result and result[0] in TARGET_TABLES and result[0] not in finished_tables:
                    table_name, month_links = result
                    print(f"\n开始处理表格:{table_name}")

                    handle_month_data(driver, table_name, month_links)
                    finished_tables.add(table_name)

                # Remove the handled row and wait until the DOM reflects it.
                driver.execute_script("arguments[0].remove()", row)
                WebDriverWait(driver, 10).until(
                    EC.staleness_of(row)
                )

            except StaleElementReferenceException:
                print("检测到元素失效,自动刷新表格")
                driver.refresh()
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
                )

    finally:
        driver.quit()
+
+
def handle_month_data(driver, table_name, month_links):
    """Download every month of one table, each in its own browser tab.

    Args:
        driver: Selenium WebDriver currently showing the index page.
        table_name: Title of the table being downloaded.
        month_links: Iterable of ``(month_number, page_url)`` tuples; entries
            whose month is outside 1-12 are ignored.
    """
    main_window = driver.current_window_handle
    for idx, month_data in enumerate(month_links):
        if not 1 <= month_data[0] <= 12:
            continue

        # Work in a separate tab so the index page's DOM stays untouched.
        driver.switch_to.window(main_window)
        # The URL is passed as a script argument rather than spliced into the
        # JavaScript source, so quotes in it cannot break the statement.
        driver.execute_script(
            "window.open(arguments[0], arguments[1])", month_data[1], f"_blank_{idx}")
        driver.switch_to.window(driver.window_handles[-1])

        try:
            if download_monthly_data(driver, table_name, month_data):
                print(f"{month_data[0]}月下载成功")
        finally:
            # Always close the tab and return to the index page.
            driver.close()
            driver.switch_to.window(main_window)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, f"#yb{YEAR}RMB"))
            )


if __name__ == "__main__":
    Path('downloads').mkdir(exist_ok=True)
    target_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
    crawl_with_selenium(target_url)
    print("全年数据下载任务已完成")

+ 0 - 0
shandong/__init__.py


+ 192 - 0
shandong/selenium_shandong_download.py

@@ -0,0 +1,192 @@
+import argparse
+import random
+import time
+from datetime import datetime, timedelta
+
+from selenium import webdriver
+from selenium.common import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from db_helper import DBHelper
+from shandong.shandong_parse_excel import parse_excel
+from utils.constants import DOWNLOAD_DIR
+from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
+from utils.parse_utils import traverse_and_process
+
+# Basic configuration
+
+# NOTE(review): MAX_RETRY is not referenced by any function in this module.
+MAX_RETRY = 3
+# List page on qingdao.customs.gov.cn that detect_latest_month() opens.
+BASE_URL = "http://qingdao.customs.gov.cn/qingdao_customs/406535/fdzdgknr30/406514/406515/index.html"
+# Sub-directory of DOWNLOAD_DIR used for the Shandong files.
+download_dir = DOWNLOAD_DIR / "shandong"
+
+
+
+def detect_latest_month(driver):
+    """三级回溯智能检测最新有效月份"""
+    driver.get(BASE_URL)
+    current_date = datetime.now()
+    for offset in range(0, 3):
+        check_date = current_date - timedelta(days=offset * 30)
+        check_year = check_date.year
+        check_month = check_date.month
+
+        target_title = f"{check_year}年{check_month}月山东省进出口主要国别(地区)总值"
+        try:
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]'))
+            )
+            print(f"已找到最新月份数据 {check_year}-{check_month}")
+            return check_year, check_month
+        except:
+            print(f"未找到 {target_title}")
+            continue
+    raise Exception("三个月内未找到有效数据")
+
+
+
+def process_month_data(driver, year, month):
+
+    required_titles = [
+        f"{year}年{month}月山东省进口20位主要商品总值",
+        f"{year}年{month}月山东省出口20位主要商品总值",
+        f"{year}年{month}月山东省各地市进出口总值",
+        f"{year}年{month}月山东省进出口主要国别(地区)总值"
+    ]
+
+    found_count = 0
+    links = driver.find_elements(By.XPATH, '//a[contains(@title,"山东省")]')
+
+    for link in links:
+        title = link.get_attribute("title")
+        if title in required_titles:
+            try:
+                url = link.get_attribute("href")
+                download_excel(driver, url, year, month, title, download_dir)
+                found_count += 1
+                time.sleep(random.uniform(0.5, 1.5))  # 下载间隔
+            except Exception as e:
+                print(f"下载 {title} 失败: {str(e)}")
+
+    print(f"本页找到{found_count}个有效表格")
+    return found_count
+
+
+def reverse_crawler(driver, target_months):
+    """Page through the list and download the tables of each target month.
+
+    For every (year, month) in target_months the current page is scanned with
+    process_month_data(); while fewer than four tables have been counted the
+    "下一页" (next page) link is clicked and the scan repeats. The browser is
+    not navigated back between months, so each month starts on whatever page
+    the previous month stopped on.
+
+    Args:
+        driver: Selenium WebDriver already showing the list page.
+        target_months: Iterable of (year, month) tuples, e.g.
+            [(2023, 5), (2023, 4)].
+
+    Returns:
+        set: The (year, month) tuples for which at least four tables were
+        counted.
+    """
+    processed_months = set()
+    # Page counter used only in log output; it keeps counting across months.
+    page = 1
+    for year, month in target_months:
+        print(f"\n开始处理 {year}年{month}月数据".center(50, "="))
+
+        # Wait until the list container is present before scanning.
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+        )
+
+        # NOTE(review): current_page is reset and incremented but never read.
+        current_page = 1
+        found_tables = 0
+
+        while True:
+            # Random pause (2-5 s) before touching the page.
+            random_sleep(base=2, variance=3)
+
+            try:
+                # A per-page month check used to live here (commented out in
+                # the original); the loop now relies on the table count only.
+                print(f"当前页面:{driver.current_url}, 第{page}页")
+
+                # Download whatever tables of this month are on the page.
+                found = process_month_data(driver, year, month)
+                found_tables += found
+
+                # All four tables counted: this month is done.
+                if found_tables >= 4:
+                    print(f"已完成{year}年{month}月全部表格采集")
+                    processed_months.add((year, month))
+                    break
+
+                print(f"第{page}页已采集表格数:{found_tables}/4,前往下一页采集")
+                # Go to the next page; a TimeoutException here means no
+                # clickable "next page" link was found.
+                WebDriverWait(driver, 15).until(
+                    EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
+                ).click()
+
+                current_page += 1
+                page += 1
+
+
+            except TimeoutException:
+                # No further page: give up on this month with a partial count.
+                print(f"未找到更多分页,已采集表格数:{found_tables}/4")
+                break
+            except Exception as e:
+                print(f"分页异常:{str(e)}")
+                handle_retry(driver)  # refresh the page; re-raises if that fails too
+                break
+
+    return processed_months
+
+
+
+def random_sleep(base=2, variance=5):
+    """智能随机等待"""
+    sleep_time = base + random.random() * variance
+    time.sleep(sleep_time)
+
+def handle_retry(driver):
+    """异常恢复处理"""
+    try:
+        driver.refresh()
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+        )
+        print("浏览器异常已恢复")
+    except:
+        print("需要人工干预的严重错误")
+        raise
+
+
+def main():
+    """主入口(优化参数处理逻辑)"""
+    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+    parser.add_argument('--year', type=int, default=None,
+                        help='终止年份(如2023),未指定时抓取最新两个月')
+    args = parser.parse_args()
+    driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
+    try:
+        # 智能检测最新有效月份
+        valid_year, valid_month = detect_latest_month(driver)
+        print(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
+
+        # 生成目标序列
+        if args.year:
+            # 指定年份时:从最新月到目标年1月
+            target_months = generate_month_sequence(
+                start_year=valid_year,
+                start_month=valid_month,
+                end_year=args.year
+            )
+        else:
+            # 未指定年份时:取最近两个月
+            target_months = generate_month_sequence(valid_year, valid_month)
+
+        print(f"目标采集月份序列:{target_months}")
+        reverse_crawler(driver, target_months)
+        print(f"{len(target_months)}个月份数据已采集完毕")
+
+    finally:
+        driver.quit()
+        print("\n数据清洗入库中...")
+        traverse_and_process(download_dir, parse_excel, province_name="shandong")
+        print("\n山东省地级市数据同比更新中...")
+        db_helper = DBHelper()
+        db_helper.update_shandong_yoy()
+
+
+if __name__ == "__main__":
+    main()

+ 207 - 0
shandong/selenium_shandong_read.py

@@ -0,0 +1,207 @@
+import os
+import random
+import re
+import time
+from pathlib import Path
+from urllib.parse import urljoin
+
+import pandas as pd
+import requests
+from faker import Faker
+from selenium import webdriver
+from selenium.webdriver import FirefoxOptions, ActionChains
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+# Month whose tables this script reads; edit by hand to target another month.
+YEAR_MONTH = "2025年3月"
+# Exact link titles (the `title` attribute) looked up by find_target_links().
+TARGET_TITLES = [
+    f"{YEAR_MONTH}山东省进口20位主要商品总值",
+    f"{YEAR_MONTH}山东省出口20位主要商品总值",
+    f"{YEAR_MONTH}山东省各地市进出口总值",
+    f"{YEAR_MONTH}山东省进出口主要国别(地区)总值"
+]
+
+# List page on qingdao.customs.gov.cn passed to crawl_with_selenium().
+URL = "http://qingdao.customs.gov.cn/qingdao_customs/406535/fdzdgknr30/406514/406515/index.html"
+
+
+
+
+def process_table_row(row):
+    """动态处理表格行数据(Selenium语法)"""
+    try:
+        # 获取所有表格单元格(td)元素
+        cells = row.find_elements(By.TAG_NAME, 'td')
+        if len(cells) < 2:
+            return None
+
+        # 获取表格名
+        table_name = cells[0].text.strip()
+
+        # 获取第二列中的所有链接,提取月份和href
+        month_links = []
+        links = cells[1].find_elements(By.TAG_NAME, 'a')
+        for a in links:
+            # 获取文本并去掉‘月’
+            month_text = a.text
+            if '月' in month_text:
+                month = int(month_text.replace('月', '').strip())
+                href = a.get_attribute('href')
+                if href:
+                    month_links.append((month, href))
+
+        # 按月份升序排列(1-12月)
+        month_links.sort(key=lambda x: x[0])
+
+        return (table_name, month_links)
+
+    except Exception as e:
+        print(f"表格行处理异常: {str(e)}")
+        return None
+
+
+
+def configure_stealth_options():
+    """增强型反检测配置[1,4](@ref)"""
+    opts = FirefoxOptions()
+    download_dir = os.path.abspath(os.path.join('../../downloads', "2025"))
+    print("当前下载路径:", Path(download_dir).resolve())
+    # 文件下载配置
+    opts.set_preference("browser.download.dir", download_dir)
+    opts.set_preference("browser.download.folderList", 2)
+    opts.set_preference("browser.download.manager.showWhenStarting", False)
+    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
+                        "application/octet-stream, application/vnd.ms-excel")  # 覆盖常见文件类型
+    opts.set_preference("browser.download.manager.useWindow", False)  # 禁用下载管理器窗口
+    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # 关闭完成提示
+
+    # 反检测参数
+    opts.set_preference("dom.webdriver.enabled", False)
+    opts.set_preference("useAutomationExtension", False)
+    opts.add_argument("--disable-blink-features=AutomationControlled")
+
+    # 动态指纹
+    fake = Faker()
+    opts.set_preference("general.useragent.override", fake.firefox())
+    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")
+
+    # 视口配置
+    opts.add_argument("--width=1440")
+    opts.add_argument("--height=900")
+    opts.add_argument("--headless")
+    return opts
+
+
+def find_target_links(driver):
+    """定位目标列表项(网页7、8的XPath文本定位方案)"""
+    WebDriverWait(driver, 20).until(
+        EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
+    )
+
+    targets = []
+    # 使用XPath精准匹配标题文本
+    for title in TARGET_TITLES:
+        xpath = f'//ul[@class="conList_ul"]//a[@title="{title}"]'
+        link = WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.XPATH, xpath))
+        ).get_attribute("href")
+        targets.append((title, link))
+        time.sleep(random.uniform(1, 3))  # 随机延迟防检测
+    return targets
+
+
+def wait_for_download_complete(download_dir, timeout=60):  # 延长超时时间至60秒
+    temp_extensions = ('.part', '.crdownload')
+    start_time = time.time()
+
+    while (time.time() - start_time) < timeout:
+        current_files = set(Path(download_dir).rglob('*'))
+        # 过滤临时文件和未完成下载的文件
+        valid_files = {f for f in current_files if not f.name.endswith(temp_extensions)}
+
+        if valid_files:
+            try:
+                newest_file = max(valid_files, key=lambda f: f.stat().st_ctime)
+                with newest_file.open('rb') as test_file:  # 尝试读取文件
+                    return newest_file
+            except (PermissionError, IOError):
+                continue  # 文件仍被占用或未完成写入
+        time.sleep(1)
+    raise TimeoutError("文件下载超时")
+
+
+def read_remote_excel(url):
+    try:
+        # 发送HTTP请求获取文件流
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()  # 检查状态码
+
+        # 将二进制流转换为DataFrame
+        excel_data = pd.read_excel(
+            io=response.content,
+            engine='openpyxl'  # 必须指定引擎(网页6)
+        )
+        return excel_data
+    except requests.exceptions.RequestException as e:
+        print(f"远程读取失败: {str(e)}")
+        return None
+
+def download_excel(title ,driver, url):
+    """处理下载逻辑(网页7的新标签页策略)"""
+    main_window = driver.current_window_handle
+    # 新标签页打开(避免主页面DOM变化)
+    driver.execute_script(f"window.open('{url}')")
+    driver.switch_to.window(driver.window_handles[-1])
+
+    try:
+        # 等待下载按钮出现
+        excel_link = WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.XPATH, '//a[text()="表格下载" and contains(@href, ".xls")]'))
+        )
+        # 获取相对路径并转换为绝对URL(关键步骤[2,7](@ref))
+        relative_url = excel_link.get_attribute("href")
+        base_url = "http://qingdao.customs.gov.cn"  # 根据实际情况调整
+        absolute_url = urljoin(base_url, relative_url)
+
+        print(f"数据:{read_remote_excel(absolute_url)}")
+
+
+    finally:
+        driver.close()
+        driver.switch_to.window(main_window)
+
+
+def crawl_with_selenium(url):
+    driver = webdriver.Firefox(options=configure_stealth_options())
+
+    try:
+        # 注入反检测脚本
+        driver.execute_script("""
+            Object.defineProperty(navigator, 'webdriver', { 
+                get: () => undefined 
+            });
+            window.alert = () => {};
+        """)
+
+        # 页面加载策略[7,8](@ref)
+        driver.get(url)
+
+        # 获取目标链接
+        targets = find_target_links(driver)
+
+        # 遍历下载
+        for title, url in targets:
+            print(f"正在处理:{title}")
+            download_excel(title, driver, url)
+            time.sleep(random.randint(5, 10))  # 大间隔防封禁
+
+
+    finally:
+        driver.quit()
+
+
+
+
+if __name__ == "__main__":
+    # Script entry: crawl the configured list page for YEAR_MONTH.
+    crawl_with_selenium(URL)
+    print(f"山东省{YEAR_MONTH}下载任务已完成")

+ 309 - 0
shandong/shandong_parse_excel.py

@@ -0,0 +1,309 @@
+import re
+from pathlib import Path
+
+import pandas as pd
+
+from db_helper import DBHelper
+from quanguo.CountryTrade import COUNTRY_CODE_MAPPING
+from utils.constants import DOWNLOAD_DIR
+from utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, \
+    extract_year_month_from_path, get_previous_month_dir, find_unmatched_countries, traverse_and_process
+
+# Constants
+# Province code / name written into every row this module inserts.
+PROV_CODE = "370000"
+PROV_NAME = "山东省"
+
+
+# City name as it appears in the sheet -> six-digit city code. Names missing
+# from this mapping get a NaN city_code in process_region_trade().
+SHANDONG_CITY = {
+    "济南": "370100", "青岛": "370200", "淄博": "370300", "枣庄": "370400",
+    "东营": "370500", "烟台": "370600", "潍坊": "370700", "济宁": "370800",
+    "泰安": "370900", "威海": "371000", "日照": "371100", "临沂": "371300",
+    "德州": "371400", "聊城": "371500", "滨州": "371600", "菏泽": "371700"
+}
+# Sub-directory of DOWNLOAD_DIR that holds the Shandong files.
+download_dir = DOWNLOAD_DIR / "shandong"
+
+# Four-digit year / two-digit month (01-12) patterns.
+# NOTE(review): neither pattern is referenced elsewhere in this module.
+YEAR_PATTERN = re.compile(r"^\d{4}$")
+MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")
+
+
+def parse_excel(current_dir):
+    """主解析入口(优化为单参数模式)
+    Args:
+        current_dir (str): 当前月份数据目录(格式:/年份/省份/月份)
+    """
+    current_path = Path(current_dir)
+    year, month = extract_year_month_from_path(current_path)
+
+    try:
+        # 动态获取前月目录
+        prev_dir = get_previous_month_dir(current_path) if month != 1 else None
+
+        # 处理商品贸易数据
+        process_combined_trade(current_path, year, month, prev_dir)
+
+        # 处理地市贸易数据
+        current_file_path = next(current_path.glob("*地市*"), None)
+        prev_file_path = next(Path(prev_dir).glob("*地市*"), None) if prev_dir else None
+        if current_file_path:
+            process_region_trade(current_file_path, prev_file_path, year, month)
+
+        # 处理国别贸易数据(保持原有逻辑结构)
+        country_file = next(current_path.glob("*国别*"), None)
+        prev_country_file = next(Path(prev_dir).glob("*国别*"), None) if prev_dir else None
+        if country_file:
+            process_country_trade(country_file, prev_country_file, year, month)
+
+        print(f"{current_dir}数据已全部成功处理")
+    except Exception as e:
+        print(f"处理失败:{current_dir},错误:{str(e)}")
+        raise
+
+
+def process_combined_trade(current_dir, year, month, previous_dir=None):
+    """处理合并商品贸易数据(增强1月逻辑)"""
+    import_file = next(current_dir.glob("*进口20位主要商品总值*"), None)
+    export_file = next(current_dir.glob("*出口20位主要商品总值*"), None)
+    if not (import_file and export_file):
+        raise FileNotFoundError("缺少进口或出口文件")
+
+    # 读取当前月数据(保持原有逻辑)
+    current_data = read_trade_pair(import_file, export_file)
+
+    # 处理历史数据
+    prev_data = pd.DataFrame()
+    if previous_dir and month != 1:
+        prev_import = next(Path(previous_dir).glob("*进口20位主要商品总值*"), None)
+        prev_export = next(Path(previous_dir).glob("*出口20位主要商品总值*"), None)
+        if prev_import and prev_export:
+            prev_data = read_trade_pair(prev_import, prev_export)
+
+    # 计算逻辑优化
+    merged_data = current_data if month == 1 else calculate_monthly_values(current_data, prev_data)
+
+    # 保留原有数据库交互逻辑
+    db = DBHelper()
+    merged_data['commodity_code'] = merged_data['commodity_name'].apply(db.get_commodity_id)
+    valid_data = merged_data[merged_data['commodity_code'].notnull()].copy()
+
+    # 构建入库数据(保持原有字段结构)
+    valid_data['crossborder_year'] = year
+    valid_data['crossborder_year_month'] = f"{year}-{month:02d}"
+    valid_data['prov_code'] = PROV_CODE
+    valid_data['prov_name'] = PROV_NAME
+    valid_data['monthly_total'] = valid_data['monthly_import'] + valid_data['monthly_export']
+
+    # 入库逻辑保持不变
+    target_cols = [
+        'crossborder_year', 'crossborder_year_month', 'prov_code', 'prov_name',
+        'commodity_code', 'commodity_name', 'monthly_total', 'monthly_import', 'monthly_export'
+    ]
+    db.bulk_insert(
+        valid_data[target_cols],
+        't_yujin_crossborder_prov_commodity_trade',
+        conflict_columns=['crossborder_year_month', 'prov_code', 'commodity_code'],
+        update_columns=['monthly_total', 'monthly_import', 'monthly_export']
+    )
+
+
+def process_region_trade(current_file_path, prev_file_path, year, month):
+    """处理地市贸易数据(增强1月逻辑)"""
+    # 读取当前数据
+    current_df = pd.read_excel(
+        current_file_path,
+        skipfooter=1,
+        header=4,
+        names=['city_name', 'monthly_total', 'yoy_import_export',
+               'monthly_export', 'yoy_export', 'monthly_import', 'yoy_import']
+    )
+
+    # 1月特殊处理
+    if month == 1:
+        df = current_df[['city_name', 'monthly_total',
+                         'monthly_export', 'monthly_import']].copy()
+        df['yoy_import_export'] = current_df['yoy_import_export']
+        df['yoy_export'] = current_df['yoy_export']
+        df['yoy_import'] = current_df['yoy_import']
+    else:
+        prev_df = pd.read_excel(
+            prev_file_path,
+            skipfooter=1,
+            header=4,
+            names=['city_name', 'monthly_total', 'yoy_import_export',
+                   'monthly_export', 'yoy_export', 'monthly_import', 'yoy_import']
+        ) if prev_file_path else pd.DataFrame()
+
+        merged_df = pd.merge(
+            current_df,
+            prev_df,
+            on='city_name',
+            suffixes=('_current', '_prev')
+        )
+        df = pd.DataFrame({
+            'city_name': merged_df['city_name'],
+            'monthly_total': merged_df['monthly_total_current'] - merged_df['monthly_total_prev'],
+            'yoy_import_export': merged_df['yoy_import_export_current'],
+            'monthly_export': merged_df['monthly_export_current'] - merged_df['monthly_export_prev'],
+            'yoy_export': merged_df['yoy_export_current'],
+            'monthly_import': merged_df['monthly_import_current'] - merged_df['monthly_import_prev'],
+            'yoy_import': merged_df['yoy_import_current']
+        })
+
+    # 保留原有处理逻辑
+    df['city_code'] = df['city_name'].map(SHANDONG_CITY)
+    df['crossborder_year'] = year
+    df['crossborder_year_month'] = f"{year}-{month:02d}"
+    df['prov_code'] = PROV_CODE
+    df['prov_name'] = PROV_NAME
+
+    # 单位转换
+    for col in ['monthly_total', 'monthly_import', 'monthly_export']:
+        df[col] = df[col].apply(convert_wan_to_yuan)
+
+    db = DBHelper()
+    db.bulk_insert(
+        df,
+        't_yujin_crossborder_prov_region_trade',
+        conflict_columns=['crossborder_year_month', 'city_code'],
+        update_columns=['monthly_total', 'monthly_import', 'monthly_export',
+                        'yoy_import_export', 'yoy_import', 'yoy_export']
+    )
+
+
+def process_country_trade(current_file_path, prev_file_path, year, month):
+    """处理国别贸易数据(增强1月逻辑)"""
+    # 读取当前数据
+    current_df = read_with_header4(current_file_path)
+
+    current_df = current_df[~current_df['country_name'].str.contains('注:', na=False)]
+    current_df = current_df.dropna(subset=['country_name'])
+    current_df = current_df[current_df['country_name'].str.strip() != '']
+
+    # 1月特殊处理
+    if month == 1:
+        final_df = current_df.copy()
+        final_df[['monthly_total', 'monthly_export', 'monthly_import']] = \
+            current_df[['monthly_total', 'monthly_export', 'monthly_import']]
+    else:
+        prev_df = read_with_header4(prev_file_path)
+
+        prev_df = prev_df[~prev_df['country_name'].str.contains('注:', na=False)]
+        prev_df = prev_df.dropna(subset=['country_name'])
+        prev_df = prev_df[prev_df['country_name'].str.strip() != '']
+
+        merged_df = pd.merge(
+            current_df,
+            prev_df,
+            on='country_name',
+            suffixes=('_current', '_prev'),
+            how='inner'
+        )
+
+        merged_df['monthly_total'] = merged_df['monthly_total_current'] - merged_df['monthly_total_prev']
+        merged_df['monthly_export'] = merged_df['monthly_export_current'] - merged_df['monthly_export_prev']
+        merged_df['monthly_import'] = merged_df['monthly_import_current'] - merged_df['monthly_import_prev']
+        merged_df['yoy_import_export'] = merged_df['yoy_import_export_current']
+        merged_df['yoy_export'] = merged_df['yoy_export_current']
+        merged_df['yoy_import'] = merged_df['yoy_import_current']
+        final_df = merged_df[[
+            'country_name','monthly_total', 'monthly_import', 'monthly_export',
+            'yoy_import_export', 'yoy_import', 'yoy_export'
+        ]]
+
+    # 排除特殊国家(新增过滤逻辑)
+    final_df = final_df[
+        ~final_df['country_name'].str.contains('东盟|欧盟', na=False, regex=True)
+    ]
+    final_df['country_code'] = final_df['country_name'].map(COUNTRY_CODE_MAPPING)
+
+    find_unmatched_countries(final_df)
+
+    final_df['crossborder_year'] = year
+    final_df['crossborder_year_month'] = f"{year}-{month:02d}"
+    final_df['prov_code'] = PROV_CODE
+    final_df['prov_name'] = PROV_NAME
+
+    # 单位转换
+    for col in ['monthly_total', 'monthly_import', 'monthly_export']:
+        final_df[col] = final_df[col].apply(convert_wan_to_yuan)
+
+    db = DBHelper()
+    db.bulk_insert(
+        final_df,
+        't_yujin_crossborder_prov_country_trade',
+        conflict_columns=['crossborder_year_month', 'prov_code', 'country_code'],
+        update_columns=['monthly_total', 'monthly_import', 'monthly_export',
+                        'yoy_import_export', 'yoy_import', 'yoy_export']
+    )
+
+
+def read_with_header4(file_path):
+    # 第一阶段:读取原始数据(固定列范围)
+    raw_df = pd.read_excel(
+        file_path,
+        usecols="A:G",  # 强制读取前7列
+        header=None,  # 禁用自动表头识别
+        skipfooter=1
+    )
+
+    # 第二阶段:计算列偏移量
+    if raw_df.iloc[:, 0:2].isnull().all().all():  # 前两列全为空
+        col_offset = 2  # 从第三列开始(A3起始)
+    else:
+        col_offset = 0  # 默认从第一列开始(A1起始)
+
+    # 第三阶段:应用header=4逻辑
+    header_row = 4  # 保持原有header行位置
+    data_start_row = header_row + 1  # 数据起始行
+
+    # 重新读取有效数据
+    final_df = pd.read_excel(
+        file_path,
+        usecols=raw_df.columns[col_offset:col_offset + 7],  # 动态列范围
+        header=header_row,
+        skipfooter=1
+    )
+
+    # 第四阶段:强制列名对齐
+    final_df.columns = [
+        'country_name', 'monthly_total', 'yoy_import_export',
+        'monthly_export', 'yoy_export', 'monthly_import', 'yoy_import'
+    ]
+
+    # 清洗国家名称
+    final_df['country_name'] = final_df['country_name'].apply(clean_county_name)
+
+    return final_df
+
+
+def read_trade_pair(import_path, export_path):
+    """进/出口表格合并"""
+    df_import = pd.read_excel(import_path, skiprows=3, skipfooter=1,
+                              usecols=[0, 1], names=["commodity_name", "monthly_import"]).pipe(lambda df: df.assign(
+        commodity_name=df["commodity_name"].apply(clean_commodity_name)
+    ))
+
+    df_export = pd.read_excel(export_path, skiprows=3, skipfooter=1,
+                              usecols=[0, 1], names=["commodity_name", "monthly_export"]).pipe(lambda df: df.assign(
+        commodity_name=df["commodity_name"].apply(clean_commodity_name)
+    ))
+    merged = pd.merge(df_import, df_export, on="commodity_name", how="outer").fillna(0)
+    merged["monthly_import"] = merged["monthly_import"].apply(convert_wan_to_yuan)
+    merged["monthly_export"] = merged["monthly_export"].apply(convert_wan_to_yuan)
+    return merged
+
+
+def calculate_monthly_values(current_data, prev_data):
+    """根据上个月进出口数据计算当月数据"""
+    merged = pd.merge(current_data, prev_data, on="commodity_name",
+                      how="left", suffixes=("_current", "_prev")).fillna(0)
+    merged["monthly_import"] = merged["monthly_import_current"] - merged["monthly_import_prev"]
+    merged["monthly_export"] = merged["monthly_export_current"] - merged["monthly_export_prev"]
+    return merged[["commodity_name", "monthly_import", "monthly_export"]]
+
+
+
+if __name__ == "__main__":
+    # The full re-parse of the download directory is disabled here; running
+    # this module directly only triggers the year-over-year update.
+    # traverse_and_process(download_dir, parse_excel, province_name="shandong")
+    print("\n山东省地级市数据同比更新中...")
+    db_helper = DBHelper()
+    db_helper.update_shandong_yoy()

+ 0 - 0
utils/__init__.py


+ 332 - 0
utils/constants.py

@@ -0,0 +1,332 @@
+import os
+from pathlib import Path
+
+# Root directory for downloaded files: the DOWNLOAD_DIR environment variable
+# if set, otherwise "../downloads". resolve() makes the path absolute, so a
+# relative value is interpreted against the process's working directory.
+DOWNLOAD_DIR = Path(os.getenv("DOWNLOAD_DIR", "../downloads")).resolve()
+
+
+
+EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
+                   "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",
+                   "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区",
+                   "国家(地区)不明", "东盟(10国)", "欧盟(27国,不含英国)",
+                   "大洋洲其他国家(地区)", "拉丁美洲其他国家(地区)","非洲其他国家(地区)",
+                   "欧洲其他国家(地区)","拉丁美洲和加勒比(简称拉美)", "加勒比(UN)", "荷属安地列斯",
+                   "塞尔维亚和黑山"]
+
+
+
+COUNTRY_CODE_MAPPING = {
+    # ================= 亚洲 =================
+    "阿富汗": "AF",
+    "巴林": "BH",
+    "孟加拉国": "BD",
+    "不丹": "BT",
+    "文莱": "BN",
+    "缅甸": "MM",
+    "柬埔寨": "KH",
+    "塞浦路斯": "CY",
+    "朝鲜": "KP",
+    "中国香港": "HK",
+    "印度": "IN",
+    "印度尼西亚": "ID",
+    "伊朗": "IR",
+    "伊拉克": "IQ",
+    "以色列": "IL",
+    "日本": "JP",
+    "约旦": "JO",
+    "科威特": "KW",
+    "老挝": "LA",
+    "黎巴嫩": "LB",
+    "中国澳门": "MO",
+    "马来西亚": "MY",
+    "马尔代夫": "MV",
+    "蒙古": "MN",
+    "尼泊尔": "NP",
+    "阿曼": "OM",
+    "巴基斯坦": "PK",
+    "巴勒斯坦": "PS",
+    "菲律宾": "PH",
+    "卡塔尔": "QA",
+    "沙特阿拉伯": "SA",
+    "新加坡": "SG",
+    "韩国": "KR",
+    "斯里兰卡": "LK",
+    "叙利亚": "SY",
+    "泰国": "TH",
+    "土耳其": "TR",
+    "阿联酋": "AE",
+    "也门": "YE",
+    "越南": "VN",
+    "中国": "CN",
+    "中国台湾": "TW",
+    "哈萨克斯坦": "KZ",
+    "吉尔吉斯斯坦": "KG",
+    "塔吉克斯坦": "TJ",
+    "土库曼斯坦": "TM",
+    "乌兹别克斯坦": "UZ",
+    "格鲁吉亚": "GE",
+    "亚美尼亚": "AM",
+    "阿塞拜疆": "AZ",
+
+    # ================= 非洲 =================
+    "阿尔及利亚": "DZ",
+    "安哥拉": "AO",
+    "贝宁": "BJ",
+    "博茨瓦纳": "BW",
+    "布隆迪": "BI",
+    "喀麦隆": "CM",
+    "佛得角": "CV",
+    "中非": "CF",
+    "乍得": "TD",
+    "科摩罗": "KM",
+    "刚果共和国": "CG",
+    "吉布提": "DJ",
+    "埃及": "EG",
+    "赤道几内亚": "GQ",
+    "埃塞俄比亚": "ET",
+    "加蓬": "GA",
+    "冈比亚": "GM",
+    "加纳": "GH",
+    "几内亚": "GN",
+    "几内亚比绍": "GW",
+    "科特迪瓦": "CI",
+    "肯尼亚": "KE",
+    "莱索托": "LS",
+    "利比里亚": "LR",
+    "利比亚": "LY",
+    "马达加斯加": "MG",
+    "马拉维": "MW",
+    "马里": "ML",
+    "毛里塔尼亚": "MR",
+    "毛里求斯": "MU",
+    "摩洛哥": "MA",
+    "莫桑比克": "MZ",
+    "纳米比亚": "NA",
+    "尼日尔": "NE",
+    "尼日利亚": "NG",
+    "卢旺达": "RW",
+    "圣多美和普林西比": "ST",
+    "塞内加尔": "SN",
+    "塞舌尔": "SC",
+    "塞拉利昂": "SL",
+    "索马里": "SO",
+    "南非": "ZA",
+    "苏丹": "SD",
+    "坦桑尼亚": "TZ",
+    "多哥": "TG",
+    "突尼斯": "TN",
+    "乌干达": "UG",
+    "布基纳法索": "BF",
+    "刚果民主共和国": "CD",
+    "赞比亚": "ZM",
+    "津巴布韦": "ZW",
+    "厄立特里亚": "ER",
+    "南苏丹": "SS",
+
+    # ================= 欧洲 =================
+    "比利时": "BE",
+    "丹麦": "DK",
+    "英国": "GB",
+    "德国": "DE",
+    "法国": "FR",
+    "爱尔兰": "IE",
+    "意大利": "IT",
+    "卢森堡": "LU",
+    "荷兰": "NL",
+    "希腊": "GR",
+    "葡萄牙": "PT",
+    "西班牙": "ES",
+    "阿尔巴尼亚": "AL",
+    "奥地利": "AT",
+    "保加利亚": "BG",
+    "芬兰": "FI",
+    "匈牙利": "HU",
+    "冰岛": "IS",
+    "列支敦士登": "LI",
+    "马耳他": "MT",
+    "挪威": "NO",
+    "波兰": "PL",
+    "罗马尼亚": "RO",
+    "瑞典": "SE",
+    "瑞士": "CH",
+    "爱沙尼亚": "EE",
+    "拉脱维亚": "LV",
+    "立陶宛": "LT",
+    "白俄罗斯": "BY",
+    "摩尔多瓦": "MD",
+    "俄罗斯": "RU",
+    "乌克兰": "UA",
+    "斯洛文尼亚": "SI",
+    "克罗地亚": "HR",
+    "捷克": "CZ",
+    "斯洛伐克": "SK",
+    "北马其顿": "MK",
+    "波斯尼亚和黑塞哥维那": "BA",
+    "梵蒂冈": "VA",
+    "塞尔维亚": "RS",
+    "黑山": "ME",
+
+    # ================= 美洲 =================
+    "安提瓜和巴布达": "AG",
+    "阿根廷": "AR",
+    "巴哈马": "BS",
+    "巴巴多斯": "BB",
+    "伯利兹": "BZ",
+    "玻利维亚": "BO",
+    "巴西": "BR",
+    "加拿大": "CA",
+    "智利": "CL",
+    "哥伦比亚": "CO",
+    "哥斯达黎加": "CR",
+    "古巴": "CU",
+    "多米尼克": "DM",
+    "多米尼加": "DO",
+    "厄瓜多尔": "EC",
+    "萨尔瓦多": "SV",
+    "格林纳达": "GD",
+    "危地马拉": "GT",
+    "圭亚那": "GY",
+    "海地": "HT",
+    "洪都拉斯": "HN",
+    "牙买加": "JM",
+    "墨西哥": "MX",
+    "尼加拉瓜": "NI",
+    "巴拿马": "PA",
+    "巴拉圭": "PY",
+    "秘鲁": "PE",
+    "圣卢西亚": "LC",
+    "圣文森特和格林纳丁斯": "VC",
+    "苏里南": "SR",
+    "特立尼达和多巴哥": "TT",
+    "美国": "US",
+    "乌拉圭": "UY",
+    "委内瑞拉": "VE",
+    "圣基茨和尼维斯": "KN",
+
+    # ================= 大洋洲 =================
+    "澳大利亚": "AU",
+    "斐济": "FJ",
+    "基里巴斯": "KI",
+    "马绍尔群岛": "MH",
+    "密克罗尼西亚联邦": "FM",
+    "瑙鲁": "NR",
+    "新西兰": "NZ",
+    "帕劳": "PW",
+    "巴布亚新几内亚": "PG",
+    "萨摩亚": "WS",
+    "所罗门群岛": "SB",
+    "汤加": "TO",
+    "图瓦卢": "TV",
+    "瓦努阿图": "VU",
+
+    # ================= 特殊地区 =================
+    "法属圭亚那": "GF",
+    "瓜德罗普": "GP",
+    "留尼汪": "RE",
+    "圣马丁": "MF",
+    "荷属圣马丁": "SX",
+    "法属波利尼西亚": "PF",
+    "新喀里多尼亚": "NC",
+    "库克群岛": "CK",
+    "关岛": "GU",
+    "波多黎各": "PR",
+    "美属萨摩亚": "AS",
+    "百慕大": "BM",
+    "开曼群岛": "KY",
+    "福克兰群岛(马尔维纳斯)": "FK",
+    "格陵兰": "GL",
+    "法属南方领地": "TF",
+    "赫德岛和麦克唐纳岛": "HM",
+    "托克劳": "TK",
+    "纽埃": "NU",
+    "诺福克岛": "NF",
+    "北马里亚纳群岛": "MP",
+    "皮特凯恩": "PN",
+    "圣赫勒拿": "SH",
+    "斯瓦尔巴群岛和扬马延岛": "SJ",
+    "东帝汶": "TL",
+    # ==== 欧洲特殊地区 ====
+    "加那利群岛": "IC",  # 西班牙特殊领土代码
+    "塞卜泰(休达)": "XC",  # 休达官方代码
+    "梅利利亚": "XL",  # 梅利利亚官方代码
+    "安道尔": "AD",
+    "直布罗陀": "GI",
+    "摩纳哥": "MC",
+    "圣马力诺": "SM",
+    "法罗群岛": "FO",  # 丹麦自治领
+    "奥兰群岛": "AX",  # 芬兰自治省
+    "格恩西": "GG",  # 英国皇家属地
+    "马恩岛": "IM",
+    "泽西": "JE",
+
+    # ==== 非洲特殊地区 ====
+    "西撒哈拉": "EH",  # 争议地区代码
+    "斯威士兰": "SZ",  # 正式国名为"Eswatini"但保留旧映射
+    "马约特": "YT",  # 法国海外省
+
+    # ==== 美洲特殊地区 ====
+    "英属印度洋领地": "IO",
+    "阿鲁巴": "AW",
+    "库拉索": "CW",
+    "马提尼克": "MQ",  # 法国海外省
+    "蒙特塞拉特": "MS",
+    "法属圣马丁": "MF",
+    "特克斯和凯科斯群岛": "TC",
+    "英属维尔京群岛": "VG",
+    "博纳尔,圣俄斯塔休斯和萨巴": "BQ",
+    "圣巴泰勒米": "BL",  # 法国海外集体
+    "美属维尔京群岛": "VI",
+    "安圭拉": "AI",
+    "圣皮埃尔和密克隆": "PM",
+    "阿拉伯联合酋长国": "AE",
+    # ==== 大洋洲特殊地区 ====
+    "瓦利斯和富图纳": "WF",
+    "科科斯(基林)群岛": "CC",
+    "圣诞岛": "CX",
+    "美国本土外小岛屿": "UM",
+
+    # ==== 特殊标记 ====
+    "布维岛": "BV",  # 挪威属地
+    "南乔治亚岛和南桑德韦奇岛": "GS",
+    "国家(地区)不明": "XX"  # 自定义代码
+}
+
+GUANGDONG_CITY = {
+  "广州市": "440100",
+  "韶关市": "440200",
+  "深圳市": "440300",
+  "珠海市": "440400",
+  "汕头市": "440500",
+  "佛山市": "440600",
+  "江门市": "440700",
+  "湛江市": "440800",
+  "茂名市": "440900",
+  "肇庆市": "441200",
+  "惠州市": "441300",
+  "梅州市": "441400",
+  "汕尾市": "441500",
+  "河源市": "441600",
+  "阳江市": "441700",
+  "清远市": "441800",
+  "东莞市": "441900",
+  "中山市": "442000",
+  "潮州市": "445100",
+  "揭阳市": "445200",
+  "云浮市": "445300"
+}
+
+GUANGDONG_CUSTOMS_URL = {
+    "广州海关": "http://guangzhou.customs.gov.cn/guangzhou_customs/381558/fdzdgknr33/381638/381572/381573/index.html",
+    "深圳海关": "http://shenzhen.customs.gov.cn/shenzhen_customs/zfxxgk15/2966748/hgtj40/index.html",
+    # "拱北海关": "http://gongbei.customs.gov.cn/gongbei_customs/374280/fdzdgknr19/374301/index.html",
+    "汕头海关": "http://shantou.customs.gov.cn/shantou_customs/zfxxgk39/3008252/3008606/596222/index.html",
+    "黄埔海关": "http://huangpu.customs.gov.cn/huangpu_customs/zfxxgk35/2969690/2969697/tjsj/index.html",
+    "江门海关": "http://jiangmen.customs.gov.cn/jiangmen_customs/536578/fdzdgknr7/536580/index.html",
+    "湛江海关": "http://zhanjiang.customs.gov.cn/zhanjiang_customs/534855/zfxxgkzn24/534857/index.html"
+}
+# "中山市2025年1-4月对外贸易进出口统计表"
+# "珠海市2025年1-4月对外贸易进出口统计表"
+# "5市报表2025年1-4月(人民币)"
+
+
+

+ 216 - 0
utils/download_utils.py

@@ -0,0 +1,216 @@
+import threading
+import time
+import traceback
+from pathlib import Path
+
+from faker import Faker
+from selenium.webdriver import FirefoxOptions
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+# Maximum time, in seconds, that wait_for_download() keeps polling the download
+# directory before it raises TimeoutError.
+DOWNLOAD_TIMEOUT = 60
+
+# Held by wait_for_download() while it lists the download directory.
+download_lock = threading.Lock()
+
+def configure_stealth_options(download_dir):
+    """Build headless Firefox options tuned for unattended Excel downloads.
+
+    The options hide the ``webdriver`` flag, replace the user agent with a
+    random one from Faker, and point Firefox downloads at ``download_dir``.
+
+    Args:
+        download_dir: Directory (str or Path) that downloads are saved into.
+
+    Returns:
+        FirefoxOptions: The configured options object.
+    """
+    # MIME types Firefox should save without asking. The download helpers in
+    # this module click both .xls and .xlsx links, so the legacy Excel type
+    # alone is not enough; application/octet-stream covers servers that do not
+    # label spreadsheets precisely.
+    silent_mime_types = ",".join((
+        "application/vnd.ms-excel",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/octet-stream",
+    ))
+
+    opts = FirefoxOptions()
+    opts.set_preference("dom.webdriver.enabled", False)
+    # NOTE(review): this switch and the two "download.*" preferences below look
+    # like Chromium settings; Firefox presumably ignores them. They are kept so
+    # the produced options stay a superset of the previous ones - confirm
+    # before removing.
+    opts.add_argument("--disable-blink-features=AutomationControlled")
+    opts.add_argument("--headless")
+    opts.set_preference("general.useragent.override", Faker().user_agent())
+    opts.set_preference("browser.download.dir", str(download_dir))
+    # folderList=2 tells Firefox to use the custom browser.download.dir.
+    opts.set_preference("browser.download.folderList", 2)
+    opts.set_preference("browser.helperApps.neverAsk.saveToDisk", silent_mime_types)
+    opts.set_preference("pdfjs.disabled", True)
+    opts.set_preference("download.prompt_for_download", False)
+    opts.set_preference("download.directory_upgrade", True)
+    return opts
+
+
+def generate_month_sequence(start_year, start_month, end_year=None, skip_january=False):
+    """Build a newest-first list of (year, month) pairs.
+
+    With ``end_year`` the sequence runs from (start_year, start_month) back to
+    January of ``end_year``. Without it, only the start month and the month
+    before it are considered.
+
+    Args:
+        start_year (int): Year of the most recent month.
+        start_month (int): Most recent month, 1-12.
+        end_year (int, optional): Last (oldest) year to include. A falsy value
+            selects the two-month mode.
+        skip_january (bool, optional): Leave every January out of the result.
+
+    Returns:
+        List[Tuple[int, int]]: Pairs in descending chronological order. The
+        list is empty when the start lies before ``end_year``.
+    """
+    if end_year:
+        # Number of months from the start month back to January of end_year.
+        # A start before end_year gives a non-positive count, so nothing is
+        # produced (the old loop still emitted one out-of-range month).
+        month_count = (start_year - end_year) * 12 + start_month
+    else:
+        month_count = 2
+
+    sequence = []
+    year, month = start_year, start_month
+    for _ in range(max(month_count, 0)):
+        if not (skip_january and month == 1):
+            sequence.append((year, month))
+
+        # Step one month back, rolling over into December of the prior year.
+        if month == 1:
+            year, month = year - 1, 12
+        else:
+            month -= 1
+
+    return sequence
+
+
+def get_previous_month(year, month):
+    """Return the (year, month) pair immediately before the given month.
+
+    January rolls over to December of the previous year.
+    """
+    is_january = month == 1
+    return (year - 1, 12) if is_january else (year, month - 1)
+
+def download_excel(driver, url, year, month, title, download_dir):
+    """Open ``url`` in a new tab, download its first Excel link and file it.
+
+    The downloaded file is moved to ``<download_dir>/<year>/<MM>/<title><ext>``,
+    replacing any file already stored under that name. The tab is closed and
+    focus returns to the first window even when the download fails.
+
+    Args:
+        driver: Selenium WebDriver used for browsing.
+        url (str): Page that contains the ``.xls``/``.xlsx`` link.
+        year (int): Year used for the target sub-directory.
+        month (int): Month used for the zero-padded target sub-directory.
+        title (str): File name (without extension) for the stored file.
+        download_dir: Directory (str or Path) the browser downloads into.
+
+    Raises:
+        TimeoutError: Propagated from wait_for_download() when no file shows up.
+    """
+    # Accept plain strings as well as Path objects, as download_excel2 does.
+    download_dir = Path(download_dir)
+    download_dir.mkdir(parents=True, exist_ok=True)
+
+    # Hand the URL over as a script argument instead of splicing it into the
+    # JavaScript source, so quotes or backslashes in the URL cannot break it.
+    driver.execute_script("window.open(arguments[0])", url)
+    driver.switch_to.window(driver.window_handles[-1])
+
+    try:
+        download_btn = WebDriverWait(driver, 20).until(
+            EC.element_to_be_clickable(
+                (By.XPATH,
+                 '//a[substring(@href, string-length(@href)-3) = ".xls" or substring(@href, string-length(@href)-4) = ".xlsx"]')
+            )
+        )
+        download_btn.click()
+
+        downloaded_file = wait_for_download(download_dir)
+        final_dir = download_dir / f'{year}' / f'{month:02d}'
+        final_dir.mkdir(parents=True, exist_ok=True)
+        final_path = final_dir / f'{title}{downloaded_file.suffix}'
+        # Path.replace overwrites an existing target, so no separate unlink.
+        downloaded_file.replace(final_path)
+        print(f"√ 文件已保存至:{final_path}")
+    finally:
+        driver.close()
+        driver.switch_to.window(driver.window_handles[0])
+
+
+def batch_download_excel(driver, url, year, month, base_title, download_dir):
+    """Open ``url`` in a new tab and download every Excel link found on it.
+
+    Files are stored as ``<download_dir>/<year>/<MM>/<base_title>_<n><ext>``
+    where ``n`` is the 1-based position of the link on the page. An existing
+    file with the same name is replaced. The tab is closed and focus returns
+    to the first window even when a download fails.
+
+    Args:
+        driver: Selenium WebDriver used for browsing.
+        url (str): Page that contains the ``.xls``/``.xlsx`` links.
+        year (int): Year used for the target sub-directory.
+        month (int): Month used for the zero-padded target sub-directory.
+        base_title (str): Common prefix of the stored file names.
+        download_dir: Directory (str or Path) the browser downloads into.
+
+    Raises:
+        TimeoutError: Propagated from wait_for_download() when no file shows up.
+    """
+    # Accept plain strings as well as Path objects, as download_excel2 does.
+    download_dir = Path(download_dir)
+    download_dir.mkdir(parents=True, exist_ok=True)
+
+    # Hand the URL over as a script argument instead of splicing it into the
+    # JavaScript source, so quotes or backslashes in the URL cannot break it.
+    driver.execute_script("window.open(arguments[0])", url)
+    driver.switch_to.window(driver.window_handles[-1])
+
+    try:
+        # Collect every link whose href ends in .xls or .xlsx.
+        download_btns = WebDriverWait(driver, 20).until(
+            EC.presence_of_all_elements_located(
+                (By.XPATH,
+                 '//a[substring(@href, string-length(@href)-3) = ".xls" or substring(@href, string-length(@href)-4) = ".xlsx"]')
+            )
+        )
+
+        # The target directory is the same for every file; create it once.
+        final_dir = download_dir / f'{year}' / f'{month:02d}'
+        final_dir.mkdir(parents=True, exist_ok=True)
+
+        for index, btn in enumerate(download_btns, start=1):
+            title = f"{base_title}_{index}"
+
+            btn.click()
+            downloaded_file = wait_for_download(download_dir)
+
+            final_path = final_dir / f'{title}{downloaded_file.suffix}'
+            # Path.replace overwrites an existing target, so no separate unlink.
+            downloaded_file.replace(final_path)
+            print(f"√ 文件 {title} 已保存至:{final_path}")
+
+    finally:
+        driver.close()
+        driver.switch_to.window(driver.window_handles[0])
+
+
+def download_excel2(driver, link, year, month, title, download_dir):
+    """Click an already-located link and file the resulting download.
+
+    The downloaded file is moved to ``<download_dir>/<year>/<MM>/<title><ext>``,
+    replacing any file already stored under that name. A download that arrives
+    without an extension is stored with ``.xlsx``.
+
+    Args:
+        driver: Selenium WebDriver; not used by this function itself.
+        link: Element to click; only its ``click()`` method is called.
+        year (int): Year used for the target sub-directory.
+        month (int): Month used for the zero-padded target sub-directory.
+        title (str): File name (without extension) for the stored file.
+        download_dir: Directory (str or Path) the browser downloads into.
+
+    Raises:
+        TimeoutError: When no finished file appears in time (logged, re-raised).
+        Exception: Any other failure is printed with its traceback, re-raised.
+    """
+    download_dir = Path(download_dir)
+    download_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        print(f"正在点击链接:{title}")
+        link.click()
+        print("等待文件下载完成...")
+        downloaded_file = wait_for_download(download_dir)
+
+        # Only the *target* name gets the fallback extension. The source path
+        # must keep pointing at the file that really exists on disk; rebinding
+        # it with with_suffix() made the later move fail with FileNotFoundError.
+        suffix = downloaded_file.suffix or '.xlsx'
+
+        final_dir = download_dir / f'{year}' / f'{month:02d}'
+        final_dir.mkdir(parents=True, exist_ok=True)
+        final_path = final_dir / f'{title}{suffix}'
+
+        # Path.replace overwrites an existing target, so no separate unlink.
+        downloaded_file.replace(final_path)
+        print(f"√ 文件已保存至:{final_path}")
+
+    except TimeoutError as te:
+        print(f"[错误] 文件下载超时:{te}")
+        raise
+    except Exception as e:
+        print(f"[错误] 发生异常:{e}")
+        print(traceback.format_exc())
+        raise
+
+
+# def wait_for_download(directory):
+#     """文件下载监控(只读取文件,忽略文件夹)"""
+#     start_time = time.time()
+#     while (time.time() - start_time) < DOWNLOAD_TIMEOUT:
+#         files = [
+#             f for f in directory.glob('*')
+#             if f.is_file() and not f.name.endswith(('.part', '.crdownload'))
+#         ]
+#         if files:
+#             # 按照创建时间排序并返回最新文件
+#             return max(files, key=lambda x: x.stat().st_ctime)
+#         time.sleep(1)
+#     raise TimeoutError("文件下载超时")
+
+
+def wait_for_download(directory):
+    """Poll ``directory`` until it holds a finished file and return it.
+
+    Only regular files directly inside ``directory`` count; names ending in
+    ``.part`` or ``.crdownload`` are skipped. When several files qualify, the
+    one with the largest ``st_ctime`` is returned. The directory is checked
+    once per second for up to DOWNLOAD_TIMEOUT seconds.
+
+    Raises:
+        TimeoutError: When no qualifying file appears within the timeout.
+    """
+    in_progress_suffixes = ('.part', '.crdownload')
+    began = time.time()
+
+    while time.time() - began < DOWNLOAD_TIMEOUT:
+        with download_lock:
+            finished = []
+            for entry in directory.glob('*'):
+                if entry.is_file() and not entry.name.endswith(in_progress_suffixes):
+                    finished.append(entry)
+            if finished:
+                return max(finished, key=lambda item: item.stat().st_ctime)
+        time.sleep(1)
+
+    raise TimeoutError("文件下载超时")

+ 25 - 0
utils/log.py

@@ -0,0 +1,25 @@
+import logging
+
+# Module-level logger; records INFO and above.
+log = logging.getLogger(__name__)
+log.setLevel(logging.INFO)
+
+# One formatter for both outputs: timestamp, logger name:line, level, message.
+formatter = logging.Formatter(
+    fmt='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+)
+
+# Records go to the file cross.log (relative to the working directory) and to
+# the console stream.
+file_handler = logging.FileHandler('cross.log')
+console_handler = logging.StreamHandler()
+
+# Both handlers share the same level and format; the file handler is attached
+# first, then the console handler.
+for _handler in (file_handler, console_handler):
+    _handler.setLevel(logging.INFO)
+    _handler.setFormatter(formatter)
+    log.addHandler(_handler)

+ 136 - 0
utils/parse_utils.py

@@ -0,0 +1,136 @@
+import re
+from decimal import Decimal
+from pathlib import Path
+
+# Matches a string made of exactly four digits; used for year directory names.
+YEAR_PATTERN = re.compile(r"^\d{4}$")
+# Matches a zero-padded month number from "01" to "12"; used for month
+# directory names.
+MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")
+
+def clean_commodity_name(name):
+    """Clean a commodity name and convert ASCII brackets to Chinese ones.
+
+    Steps, in order: drop backslash, ``*`` and ``#``; remove the "已加旧码"
+    annotation together with its parentheses (ASCII or full-width); collapse
+    runs of whitespace to one space and trim the ends; replace ``(``/``)``
+    with full-width parentheses and ``[``/``]`` with ``【``/``】``.
+
+    Args:
+        name: Raw commodity name.
+
+    Returns:
+        The cleaned name; a non-string argument is returned unchanged.
+    """
+    if not isinstance(name, str):
+        return name
+
+    # Marker characters that carry no meaning in the name.
+    name = re.sub(r"[\\*#]", "", name)
+    # The annotation may be wrapped in ASCII or in full-width parentheses
+    # (U+FF08 / U+FF09); both spellings must be matched.
+    name = re.sub(r'[(\uff08]已加旧码[)\uff09]', '', name)
+    name = re.sub(r'\s+', ' ', name).strip()
+
+    # Full-width targets are written as escapes so the mapping cannot silently
+    # degrade into an ASCII-to-ASCII no-op when the file is re-encoded.
+    bracket_map = str.maketrans({
+        '(': '\uff08',
+        ')': '\uff09',
+        '[': '【',
+        ']': '】',
+    })
+    return name.translate(bracket_map)
+
+def clean_county_name(name):
+    """Clean a country name by stripping markers, annotations and whitespace.
+
+    Steps, in order: drop ``*``; remove the "已加旧码" and "含旧码" annotations
+    together with their parentheses (ASCII or full-width); remove the
+    "其中" prefix with its colon (ASCII or full-width); delete all whitespace,
+    including whitespace inside the name.
+
+    Args:
+        name: Raw country name.
+
+    Returns:
+        The cleaned name; a non-string argument is returned unchanged.
+    """
+    if not isinstance(name, str):
+        return name
+
+    name = re.sub(r"[*]", "", name)
+    # Annotations may be wrapped in ASCII or in full-width parentheses
+    # (U+FF08 / U+FF09); both spellings must be matched.
+    name = re.sub(r'[(\uff08](?:已加旧码|含旧码)[)\uff09]', '', name)
+
+    # "其中" followed by an ASCII colon or a full-width colon (U+FF1A).
+    name = re.sub(r"其中[:\uff1a]", "", name)
+
+    # Remove every whitespace character, not only leading/trailing ones.
+    return re.sub(r'\s+', '', name)
+
+def convert_wan_to_yuan(value):
+    """Multiply an amount given in units of ten thousand by 10000.
+
+    The value is converted through ``str`` to ``Decimal``, rounded to four
+    decimal places, scaled, and returned as a ``float``.
+    """
+    four_places = Decimal('0.0000')
+    rounded = Decimal(str(value)).quantize(four_places)
+    scaled = rounded * Decimal('10000')
+    return float(scaled)
+
+
+def find_unmatched_countries(final_df):
+    """Print the country names whose ``country_code`` is missing.
+
+    Rows of ``final_df`` with a null ``country_code`` are selected and the
+    distinct values of their ``country_name`` column are printed in sorted
+    order after a warning header. Nothing is printed when every row has a
+    code. The function returns ``None``.
+    """
+    code_missing = final_df['country_code'].isnull()
+
+    # Nothing to report when every row was matched.
+    if not code_missing.any():
+        return
+
+    names_without_code = final_df.loc[code_missing, 'country_name'].unique()
+
+    print("⚠️ 以下国家名称未在 COUNTRY_CODE_MAPPING 中找到匹配:")
+    for country in sorted(names_without_code):
+        print(f"   - {country}")
+
+def extract_year_month_from_path(path):
+    """Read the year and month from the last two components of ``path``.
+
+    The second-to-last component must match YEAR_PATTERN and the last one
+    MONTH_PATTERN, e.g. ``.../shandong/2025/04`` gives ``(2025, 4)``.
+
+    Args:
+        path: A ``pathlib`` path; only its ``parts`` are inspected.
+
+    Returns:
+        Tuple[int, int]: ``(year, month)``.
+
+    Raises:
+        ValueError: When the path has fewer than two components or either
+            component has the wrong format.
+    """
+    segments = path.parts
+    if len(segments) < 2:
+        raise ValueError("路径结构不符合要求,示例:.../shandong/2025/04")
+
+    year_text, month_text = segments[-2], segments[-1]
+    if YEAR_PATTERN.match(year_text) is None:
+        raise ValueError(f"无效年份格式:{year_text}")
+    if MONTH_PATTERN.match(month_text) is None:
+        raise ValueError(f"无效月份格式:{month_text}")
+
+    return int(year_text), int(month_text)
+
+# Locate the directory of the previous month.
+def get_previous_month_dir(current_path):
+    """Return the sibling directory path for the month before ``current_path``.
+
+    ``current_path`` is expected to end in ``<year>/<month>``. ``None`` is
+    returned when those two components do not match YEAR_PATTERN and
+    MONTH_PATTERN, when the month is January (no earlier month under the same
+    year directory), or when any error occurs while building the path. The
+    returned path is not checked for existence.
+    """
+    try:
+        year_dir = current_path.parent
+        month_name = current_path.name
+
+        if not YEAR_PATTERN.match(year_dir.name) or not MONTH_PATTERN.match(month_name):
+            return None
+
+        earlier_month = int(month_name) - 1
+        if earlier_month < 1:
+            return None
+
+        # Same year directory, previous zero-padded month number.
+        return year_dir / f"{earlier_month:02d}"
+    except Exception as e:
+        print(f"前月目录生成失败:{str(e)}")
+        return None
+
+
+# Data-cleaning driver: walks the year/month directory tree.
+def traverse_and_process(root_path, process_func, province_name="henan"):
+    """Call ``process_func`` on every ``<year>/<month>`` directory, newest first.
+
+    Year directories are the sub-directories of ``root_path`` whose names match
+    YEAR_PATTERN; month directories are their sub-directories whose names match
+    MONTH_PATTERN. Years and months are both visited in descending order, and
+    progress is printed along the way.
+
+    Args:
+        root_path: Root directory (str or Path) that contains year directories.
+        process_func (function): Called with each month directory path.
+        province_name (str): Label used only in the printed messages,
+            e.g. "henan", "shandong", "fujian".
+    """
+    root = Path(root_path)
+
+    year_dirs = sorted(
+        (entry for entry in root.iterdir()
+         if entry.is_dir() and YEAR_PATTERN.match(entry.name)),
+        key=lambda entry: entry.name,
+        reverse=True,
+    )
+
+    for year_dir in year_dirs:
+        # Re-checked in case the directory disappeared since it was listed.
+        if not (year_dir.exists() and year_dir.is_dir()):
+            print(f"未找到 {province_name} 目录,跳过:{year_dir}")
+            continue
+
+        month_dirs = [
+            entry for entry in year_dir.iterdir()
+            if entry.is_dir() and MONTH_PATTERN.match(entry.name)
+        ]
+        if not month_dirs:
+            continue
+
+        print(f"\n年份:{year_dir.name} | 省份:{province_name}")
+        for month_dir in sorted(month_dirs, key=lambda entry: int(entry.name), reverse=True):
+            print(f"   月份:{int(month_dir.name):02d} | 路径:{month_dir}")
+            process_func(month_dir)  # hand the month directory to the caller's parser