Przeglądaj źródła

Merge branch 'master' of http://42.192.203.166:3000/wyp/crossborder

01495251 1 tydzień temu
rodzic
commit
f07c9ade8a

+ 1 - 1
.idea/Crossborder.iml

@@ -4,7 +4,7 @@
     <content url="file://$MODULE_DIR$">
       <excludeFolder url="file://$MODULE_DIR$/.venv" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.13 (Crossborder)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>

+ 1 - 0
.idea/misc.xml

@@ -3,4 +3,5 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.13 (Crossborder)" />
   </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
 </project>

+ 7 - 0
.idea/vcs.xml

@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

+ 306 - 0
anhui/crawl_gov_anhui_full.py

@@ -0,0 +1,306 @@
+import os
+import random
+import re
+import subprocess
+import time
+from pathlib import Path
+
+from faker import Faker
+from selenium import webdriver
+from selenium.webdriver import FirefoxOptions
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+import gov_commodity_anhui_city
+import gov_commodity_anhui_country
+import gov_commodity_anhui_import_export
+from utils import base_country_code, base_mysql
+
# Shared download directory for the crawler; created eagerly so Firefox can
# write completed downloads into it immediately.
download_dir = base_country_code.download_dir
Path(download_dir).mkdir(parents=True, exist_ok=True)
+
def configure_stealth_options():
    """Build a hardened FirefoxOptions profile for headless crawling.

    Configures silent downloads into ``download_dir``, disables webdriver
    fingerprint flags, randomizes the user agent via Faker, and forces a
    fixed 1440x900 headless viewport.
    """
    opts = FirefoxOptions()
    print("当前下载路径:", Path(download_dir).resolve())
    # Download configuration: folderList=2 means "use the custom directory".
    opts.set_preference("browser.download.dir", download_dir)
    opts.set_preference("browser.download.folderList", 2)
    opts.set_preference("browser.download.manager.showWhenStarting", False)
    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
                        "application/octet-stream, application/vnd.ms-excel")  # common Excel MIME types
    opts.set_preference("browser.download.manager.useWindow", False)  # no download-manager window
    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # no completion alert

    # Anti-detection flags.
    opts.set_preference("dom.webdriver.enabled", False)
    opts.set_preference("useAutomationExtension", False)
    opts.add_argument("--disable-blink-features=AutomationControlled")

    # Dynamic fingerprint: random Firefox UA plus a Chinese locale header.
    fake = Faker()
    opts.set_preference("general.useragent.override", fake.firefox())
    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")

    # Viewport / headless mode.
    opts.add_argument("--width=1440")
    opts.add_argument("--height=900")
    opts.add_argument("--headless")
    return opts
+
def find_target_links(driver):
    """Visit every article link on the current list page and download its files.

    Each detail link is opened in a new tab; the actual download is delegated
    to download_file_from_detail_page(), after which the tab is closed and
    control returns to the list page.

    Returns 'stop' when a detail page signals the 2022 crawl boundary,
    otherwise None (also on swallowed exceptions).
    """
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.ID, "conRight"))
    )

    try:
        # All anchors in the list container.
        elements = driver.find_elements(By.XPATH, '//ul[@class="conList_ul"]//a')
        if not elements:
            return None

        processed_urls = set()
        for link in elements:
            link_url = link.get_attribute("href")
            if link_url in processed_urls:
                continue

            # Open the detail page in a new tab and switch to it.
            driver.execute_script("window.open(arguments[0]);", link_url)
            driver.switch_to.window(driver.window_handles[-1])
            print(f"正在处理详情页: {link_url}")

            try:
                # Download attachments found on the detail page.
                download_result = download_file_from_detail_page(driver)
                if download_result == 'stop':
                    return 'stop'
                processed_urls.add(link_url)
            finally:
                # Always close the detail tab and return to the main window,
                # even on the early 'stop' return above.
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            time.sleep(random.uniform(1, 3))

        return None
    except Exception as e:
        # Best-effort: log and fall through (implicitly returns None).
        print(f"下载时发生异常: {str(e)}")
+
def download_file_from_detail_page(driver):
    """Download every qualifying Excel attachment on the open detail page.

    Skips USD-denominated and aggregate tables by filename keyword. A file
    name starting with '2022' marks the crawl boundary and aborts the run.
    Completed downloads are moved into download_dir/<year>/<month>/.

    :return: 'stop' at the 2022 boundary, otherwise None.
    """
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.ID, "easysiteText"))
    )

    try:
        elements = driver.find_elements(By.XPATH, '//div[@id="easysiteText"]//a')
        if not elements:
            print("详情页未找到目标文件链接")
            return None

        for download_btn in elements:
            file_name = download_btn.text.strip()
            if not file_name:
                continue
            # Reaching 2022 data means everything newer was already fetched.
            if file_name.startswith('2022'):
                return 'stop'
            if '美元' in file_name or '商品贸易方式' in file_name or '进出口总值' in file_name or '月度表' in file_name:
                print(f'{file_name} 不需要此文件,跳过')
                continue

            file_url = download_btn.get_attribute("href")
            # Resolve relative links against the current page's origin.
            if not file_url.startswith(('http://', 'https://')):
                base_url = driver.current_url.split('//')[0] + '//' + driver.current_url.split('/')[2]
                file_url = base_url + file_url

            if not file_url.lower().endswith(('.xls', '.xlsx')):
                print(f"跳过非 Excel 文件: {file_url}")
                continue

            print(f"正在下载: {file_name} → {file_url}")

            # Snapshot the directory so the freshly-downloaded file can be
            # distinguished afterwards.
            existing_files = set(f.name for f in Path(download_dir).glob('*'))
            # Randomized click delay to look human.
            time.sleep(random.uniform(1, 3))
            download_btn.click()

            downloaded_file = wait_for_download_complete(existing_files=existing_files)

            # NOTE: month is the END month of the span; start_month is unused.
            year, start_month, month = extract_year_and_month(file_name)
            final_path = Path(download_dir) / year / month / f"{file_name}"
            if os.path.exists(final_path):
                print(f"文件已存在:{file_name} 正在覆盖...")
                os.unlink(final_path)

            final_dir = Path(download_dir) / year / month
            final_dir.mkdir(parents=True, exist_ok=True)
            print(f"√ 正在移动文件 {downloaded_file} 至 {final_path}")
            downloaded_file.rename(final_path)
            print(f"√ 下载成功:{final_path} \n")

        return None
    except Exception as e:
        print(f"详情页处理异常: {str(e)}")
        return None
+
def extract_year_and_month(file_name):
    """Parse a year/month span out of a report file name.

    Accepts both '2025年1-2月...' (range) and '2025年3月...' (single month).

    :param file_name: attachment title from the customs site
    :return: (year, start_month, end_month), months zero-padded to 2 digits
    :raises ValueError: when no recognizable date is present
    """
    matched = re.search(r"(\d{4})年(\d{1,2})(?:-(\d{1,2}))?月", file_name)
    if not matched:
        raise ValueError(f"无法从文件名中提取年份和月份:{file_name}")

    year = matched.group(1)
    first = matched.group(2)
    # A single-month name has no range part; the span collapses to one month.
    last = matched.group(3) or first
    return year, first.zfill(2), last.zfill(2)
+
def extract_rar(rar_path, extract_to):
    """Fallback extraction via the WinRAR CLI (used when rarfile fails).

    :param rar_path: path to the .rar archive
    :param extract_to: destination directory
    :return: True on success, False on failure (including WinRAR missing)
    """
    winrar_path = r"C:\Program Files\WinRAR\Rar.exe"  # Rar.exe preferred over WinRAR.exe (no GUI)
    cmd = [winrar_path, 'x', '-y', str(rar_path), str(extract_to)]

    # CREATE_NO_WINDOW keeps a console window from flashing up on Windows.
    creationflags = subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0

    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            creationflags=creationflags  # hide the console window
        )
    except FileNotFoundError:
        # WinRAR not installed (or non-Windows host): report, don't crash.
        print(f"解压失败: 未找到解压程序 {winrar_path}")
        return False

    if result.returncode == 0:
        print(f"解压成功: {rar_path} → {extract_to}")
        return True
    else:
        # WinRAR emits GBK on Chinese Windows; never crash on a decode error.
        print(f"解压失败: {result.stderr.decode('gbk', errors='replace')}")
        return False
+
+
def crawl_with_selenium(url):
    """Crawl the paginated list starting at *url*, downloading as it goes.

    Per iteration: process the current page via find_target_links(), then
    locate the "下一页" (next page) control, extract its target URL from the
    onclick handler, and navigate. Stops on a 'stop' signal or when no
    next-page URL exists.
    """
    driver = webdriver.Firefox(options=configure_stealth_options())

    try:
        # Anti-detection script. NOTE(review): executed before driver.get(),
        # so it only runs on the initial blank page and is not re-injected
        # after navigation — confirm this is intentional.
        driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', { 
                get: () => undefined 
            });
            window.alert = () => {};
        """)

        # Load the first list page.
        driver.get(url)

        while True:
            # Process the current page; 'stop' means the 2022 boundary was hit.
            result = find_target_links(driver)
            if result == 'stop':
                break

            # Wait for the pagination widget to render.
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "gg_page"))
            )
            # Locate the clickable next-page control.
            xpath = f'//div[@class="easysite-page-wrap"]//a[@title="下一页"]'
            next_page_btn = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            # The target URL is embedded in the onclick attribute.
            next_page_url = next_page_btn.get_attribute("onclick")
            if not next_page_url:
                print("已到达最后一页,停止爬取")
                break
            # First single-quoted token of the onclick handler is the URL.
            next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
            if not next_page_url.startswith(('http://', 'https://')):
                base_url = 'http://shijiazhuang.customs.gov.cn'  # NOTE(review): Shijiazhuang domain inside the Anhui crawler looks copy-pasted — Hefei pages live on hefei.customs.gov.cn; confirm
                next_page_url = base_url + next_page_url

            # Navigate to the next list page.
            driver.get(next_page_url)

            print(f"开始爬取 {next_page_url} 页面数据")

    finally:
        driver.quit()
+
def wait_for_download_complete(timeout=30, existing_files=None):
    """Poll ``download_dir`` until a new, fully-written file appears.

    :param timeout: seconds to wait before giving up
    :param existing_files: file names already present before the download
    :return: Path of the newest newly-downloaded file
    :raises TimeoutError: when nothing finishes within *timeout* seconds
    """
    partial_suffixes = ('.part', '.crdownload')

    if existing_files is None:
        existing_files = {entry.name for entry in Path(download_dir).glob('*')}

    deadline = time.time() + timeout
    while time.time() < deadline:
        # Candidates: new names, not browser temp files, non-empty.
        fresh = [
            entry for entry in Path(download_dir).glob('*')
            if entry.name not in existing_files
            and not entry.name.endswith(partial_suffixes)
            and entry.stat().st_size > 0
        ]
        if fresh:
            # The newest file by mtime is the one that just finished.
            return max(fresh, key=lambda entry: entry.stat().st_mtime)
        time.sleep(2)
    raise TimeoutError("文件下载超时")
+
def hierarchical_traversal(root_path):
    """Layered walk: year directories -> month directories, newest first.

    For every <root_path>/<year>/<month> folder found, runs the three Anhui
    processors (import/export, country, city) in order.
    """
    root = Path(root_path)
    # All directories whose name looks like a year.
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    # Newest year first.
    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # e.g. download/2025/03; label fixed — it previously printed "jiangsu"
        # (copy-pasted from another province's crawler).
        print(f"\n年份:{year_dir.name} | 省份:anhui")

        # Collect month subdirectories.
        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({
                    "path": item,
                    "month": int(item.name)
                })
        # Newest month first.
        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                gov_commodity_anhui_import_export.process_folder(md['path'])
                gov_commodity_anhui_country.process_folder(md['path'])
                gov_commodity_anhui_city.process_folder(md['path'])
+
+if __name__ == "__main__":
+    crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html')
+    print(f"安徽合肥海关全量数据下载任务完成")
+    # 等待5s后执行
+    time.sleep(5)
+    hierarchical_traversal(base_country_code.download_dir)
+    print("安徽合肥海关类章、国家、城市所有文件处理完成!")
+    time.sleep(5)
+    base_mysql.update_january_yoy('安徽省')
+    base_mysql.update_shandong_yoy('安徽省')
+    print("安徽合肥海关城市同比sql处理完成")
+

+ 127 - 0
anhui/gov_commodity_anhui_city.py

@@ -0,0 +1,127 @@
+from pathlib import Path
+
+import pandas
+import pandas as pd
+
+from utils import base_country_code, base_mysql
+from utils.base_country_code import format_sql_value
+
# Full city name -> administrative-division code prefix for Anhui prefectures.
# NOTE(review): codes 3409 and 3414 are absent — presumably retired division
# codes; confirm against the current GB/T 2260 table.
city_code_map = {
    "安徽省合肥市": "3401",
    "安徽省芜湖市": "3402",
    "安徽省蚌埠市": "3403",
    "安徽省淮南市": "3404",
    "安徽省马鞍山市": "3405",
    "安徽省淮北市": "3406",
    "安徽省铜陵市": "3407",
    "安徽省安庆市": "3408",
    "安徽省黄山市": "3410",
    "安徽省滁州市": "3411",
    "安徽省阜阳市": "3412",
    "安徽省宿州市": "3413",
    "安徽省六安市": "3415",
    "安徽省亳州市": "3416",
    "安徽省池州市": "3417",
    "安徽省宣城市": "3418"
}
+
def get_df(path):
    """Locate the '收发货人' (consignor/consignee) workbook under *path*.

    :param path: month directory to scan
    :return: DataFrame with the first 5 header rows dropped, or None when
             the folder is empty or holds no matching workbook.
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None
    for file in file_paths:
        if "收发货人" in file.name:
            print(f"处理多文件: {file.name}")
            file_path = Path(path) / file
            # Rows 0-4 are title/header decoration; data starts at row 5.
            return pd.read_excel(file_path, header=None).iloc[5:]
    # No matching workbook. Previously this function declared `global df`
    # and fell through to `return df`, raising NameError on a fresh run.
    return None
+
def process_folder(path):
    """Parse the city-level workbook for one month and bulk-insert
    prov_region_trade rows.

    For month == 2 the sheet carries cumulative Jan-Feb values: amounts are
    halved and a synthetic January row with zeroed YoY columns is inserted.
    """
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    sql_arr_copy = []
    sql_arr = []
    res = get_df(path)
    if res is None:
        print(f"{year_month} prov_region_trade 未找到包含 地市 sheet")
        return
    df = res
    # Column 0 holds the full city name (e.g. 安徽省合肥市).
    country_name_index = 0

    for index, row in df.iterrows():

        city_name = str(row.values[country_name_index]).strip()
        city_code = city_code_map.get(city_name)
        if not city_code:
            print(f"未找到省 '{city_name}' 对应市编码")
            continue

        monthly_export, monthly_import, monthly_total, yoy_export, yoy_import, yoy_import_export = value_row(row, month)

        if month == 2:
            # Split the cumulative Jan-Feb amounts evenly over both months.
            year_month_2 = f'{year}-01'
            monthly_import = round(float(monthly_import) / 2, 4)
            monthly_export = round(float(monthly_export) / 2, 4)
            monthly_total = round(float(monthly_total) / 2, 4)
            yoy_import_export, yoy_import, yoy_export = 0, 0, 0
            sql = (f"INSERT INTO t_yujin_crossborder_prov_region_trade "
                   f"(crossborder_year, crossborder_year_month, prov_code, prov_name, city_code, city_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
                   f"('{year}', '{year_month_2}', '340000', '安徽省', '{city_code}', '{city_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
            sql_arr_copy.append(sql)

        # Row for the current month.
        sql = (f"INSERT INTO t_yujin_crossborder_prov_region_trade "
               f"(crossborder_year, crossborder_year_month, prov_code, prov_name, city_code, city_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
               f"('{year}', '{year_month}', '340000', '安徽省', '{city_code}', '{city_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
        sql_arr.append(sql)

    print(f"√ {year_month} prov_region_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    base_mysql.bulk_insert(sql_arr)
    if month == 2:
        print(f"√ {year_month} prov_region_trade 成功生成 SQL 文件 size {len(sql_arr_copy)} ")
        base_mysql.bulk_insert(sql_arr_copy)
    print(f"√ {year_month} prov_region_trade SQL 存表完成!")
+
+
def value_row(row, month):
    """Extract monthly amounts (万元 scaled to 元) and YoY percentages.

    Column layout differs between the combined Jan-Feb sheet (month == 2)
    and all other months.

    :return: (export, import, total, yoy_export, yoy_import, yoy_total)
    """
    if month == 2:
        idx_total, idx_export, idx_import, idx_yoy_total, idx_yoy_export, idx_yoy_import = 1, 3, 5, 2, 4, 6
    else:
        idx_total, idx_export, idx_import, idx_yoy_total, idx_yoy_export, idx_yoy_import = 3, 7, 11, 4, 8, 12

    cells = row.values

    def amount(i):
        # Sheets report 万元; the database stores 元.
        return round(float(cells[i]) * 10000, 4)

    def rate(i):
        return float(str(cells[i]).strip())

    monthly_total = amount(idx_total)
    yoy_import_export = rate(idx_yoy_total)
    monthly_export = amount(idx_export)
    yoy_export = rate(idx_yoy_export)
    monthly_import = amount(idx_import)
    yoy_import = rate(idx_yoy_import)
    return monthly_export, monthly_import, monthly_total, yoy_export, yoy_import, yoy_import_export
+
+
def hierarchical_traversal(root_path):
    """Walk <root_path>/<year>/<month> newest-first and process each month."""
    root = Path(root_path)
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # NOTE(review): the label prints "jiangsu" inside the Anhui module —
        # presumably copy-pasted; confirm and relabel.
        print(f"\n年份:{year_dir.name} | 省份:jiangsu")

        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({"path": item, "month": int(item.name)})

        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                process_folder(md['path'])
+
if __name__ == '__main__':
    # Standalone entry point: re-parse every downloaded month for city data.
    hierarchical_traversal(base_country_code.download_dir)
    print(f"安徽合肥海关城市所有文件处理完成!")

+ 139 - 0
anhui/gov_commodity_anhui_country.py

@@ -0,0 +1,139 @@
from pathlib import Path

import pandas as pd

# Consistent with the sibling Anhui modules: base_country_code / base_mysql
# live in the shared ``utils`` package (previously imported from
# ``com.zf.crawl``, a path no other module in this change uses).
from utils import base_country_code, base_mysql
from utils.base_country_code import format_sql_value

# Aggregate regions to skip: continents and trade blocs, which would
# double-count their member countries.
EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
                   "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",
                   "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区"]
+
def get_df(path):
    """Return the '国别' (country) workbook under *path* as a DataFrame.

    The first six rows are header decoration and are dropped. Returns None
    when the folder is empty or contains no matching workbook.
    """
    candidates = list(Path(path).glob('*'))
    if not candidates:
        print("未找到任何文件")
        return None

    match = next((f for f in candidates if "国别" in f.name), None)
    if match is None:
        return None

    print(f"处理多文件: {match.name}")
    return pd.read_excel(Path(path) / match, header=None).iloc[6:]
+
def process_folder(path):
    """Parse the country-level workbook for one month and bulk-insert
    prov_country_trade rows.

    February sheets carry cumulative Jan-Feb values: amounts are halved and
    a synthetic January row (YoY zeroed) is inserted alongside.
    """
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    sql_arr_copy = []
    sql_arr = []
    df = get_df(path)
    if df is None:
        print("未找到任何文件")
        return None

    country_name_index = 0
    # Column layout differs between the combined Jan-Feb sheet and the rest.
    if month == 2:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 1, 3, 5
    else:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 3, 7, 11

    for index, row in df.iterrows():
        # Country name with any trailing parenthesized qualifier removed.
        country_name = str(row.values[country_name_index]).strip()
        if country_name.endswith(")") or country_name.endswith(")"):
            # Handle both full-width and ASCII opening brackets. The previous
            # `rsplit("(")[0] or rsplit("(")[0]` never reached the full-width
            # branch because the first operand is always a non-empty string.
            for bracket in ("(", "("):
                if bracket in country_name:
                    country_name = country_name.split(bracket, 1)[0]
                    break

        # Skip continents / trade blocs (would double-count members).
        if country_name in EXCLUDE_REGIONS:
            continue

        # Country-code lookup; unmapped names are reported and skipped.
        country_code = base_country_code.COUNTRY_CODE_MAPPING.get(country_name)
        if not country_code:
            print(f"{year_month} 未找到国家 '{country_name}' 对应的编码")
            continue

        # Amounts scaled to 元; YoY slots are zero for country sheets.
        monthly_export, monthly_import, monthly_total, yoy_export, yoy_import, yoy_import_export = \
            value_row(row, col_total_index, col_monthly_export_index, col_monthly_import_index)

        if month == 2:
            # Split the cumulative Jan-Feb amounts evenly across both months.
            year_month_2 = f'{year}-01'
            monthly_import = round(float(monthly_import) / 2, 4)
            monthly_export = round(float(monthly_export) / 2, 4)
            monthly_total = round(float(monthly_total) / 2, 4)
            yoy_import_export, yoy_import, yoy_export = 0, 0, 0
            sql = (f"INSERT INTO t_yujin_crossborder_prov_country_trade "
                   f"(crossborder_year, crossborder_year_month, prov_code, prov_name, country_code, country_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
                   f"('{year}', '{year_month_2}', '340000', '安徽省', '{country_code}', '{country_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
            sql_arr_copy.append(sql)

        # Row for the current month.
        sql = (
            f"INSERT INTO t_yujin_crossborder_prov_country_trade "
            f"(crossborder_year, crossborder_year_month, prov_code, prov_name, country_code, country_name, "
            f"monthly_total, monthly_export, monthly_import, yoy_import_export, yoy_import, yoy_export, create_time) "
            f"VALUES ('{year}', '{year_month}', '340000', '安徽省', '{country_code}', '{country_name}', "
            f"{format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', "
            f"'{yoy_export}', NOW());"
        )
        sql_arr.append(sql)

    print(f"√ {year_month} 成功生成 SQL 条数: {len(sql_arr)}")
    # Bulk insert current-month rows, then the synthetic January rows.
    base_mysql.bulk_insert(sql_arr)
    if month == 2:
        print(f"√ {year_month} prov_country_trade 成功生成 SQL 文件 size {len(sql_arr_copy)} ")
        base_mysql.bulk_insert(sql_arr_copy)
    print(f"√ {year_month} prov_country_trade SQL 存表完成!\n")
+
+
+
+
def value_row(row, col_total_index, col_monthly_export_index, col_monthly_import_index):
    """Read total/export/import amounts from *row*, scaling 万元 -> 元.

    Cells holding '--' count as zero. The country sheets carry no YoY
    figures, so all three YoY slots are returned as 0.

    NOTE: cells are stringified before the NaN check, so ``pd.isna`` never
    fires and genuine NaN cells become float('nan') — faithful to the
    original behavior.
    """
    def as_amount(cell):
        text = str(cell)
        if pd.isna(text) or text == "--":
            return round(0.0, 4)
        return round(float(text.strip()) * 10000, 4)

    monthly_total = as_amount(row.values[col_total_index])
    monthly_export = as_amount(row.values[col_monthly_export_index])
    monthly_import = as_amount(row.values[col_monthly_import_index])

    return monthly_export, monthly_import, monthly_total, 0, 0, 0
+
+
def hierarchical_traversal(root_path):
    """Walk <root_path>/<year>/<month> newest-first and process each month."""
    root = Path(root_path)
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # NOTE(review): the label prints "jiangsu" inside the Anhui module —
        # presumably copy-pasted; confirm and relabel.
        print(f"\n年份:{year_dir.name} | 省份:jiangsu")

        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({"path": item, "month": int(item.name)})

        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                process_folder(md['path'])
+
+
if __name__ == '__main__':
    # Standalone entry point: re-parse every downloaded month for country data.
    hierarchical_traversal(base_country_code.download_dir)
    print("安徽合肥海关国别所有文件处理完成!")

+ 179 - 0
anhui/gov_commodity_anhui_import_export.py

@@ -0,0 +1,179 @@
+import re
+from pathlib import Path
+
+import pandas as pd
+
+from utils import base_country_code, base_mysql
+
+from utils.base_country_code import format_sql_value
+
# Manual renames applied after cleaning: short sheet labels -> canonical
# commodity names expected by the commodity-ID lookup.
CUSTOM_COMMODITY_REPLACEMENTS = {
    '家具': '家具及其零件',
    '眼镜': '眼镜及其零件',
}
# Commodity names whose full-width parentheses are part of the canonical
# name and must survive cleaning.
PRESERVE_PARENTHESES_KEYWORDS = {
    '汽车(包括底盘)',
}
+
+def clean_commodity_name(name, preserve_keywords=None):
+    """
+    自定义清洗商品名称逻辑,支持条件保留中文括号内容
+
+    :param name: 商品名称字符串
+    :param preserve_keywords: 需要保留括号的关键词集合
+    :return: 清洗后的商品名称
+    """
+    name = str(name).strip()
+
+    # 去除非必要符号
+    name = re.sub(r'[#*]', '', name)
+
+    # 判断是否需要保留括号内容
+    if preserve_keywords:
+        for keyword in preserve_keywords:
+            if keyword == name:
+                # 匹配到关键词时,不移除括号内容
+                return name
+
+    # 默认移除中文括号及内容
+    name = re.sub(r'([^)]*)', '', name)
+    return name.strip()
+
def process_folder(path):
    """Merge the import/export commodity workbooks of one month directory.

    Reads every file whose name contains 进口商品总值 / 出口商品总值, cleans
    the commodity names, deduplicates them, and hands both frames to
    save_to_database().
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return
    year, month = base_country_code.extract_year_month_from_path(path)

    import_df = pd.DataFrame()
    export_df = pd.DataFrame()
    for file in file_paths:
        file_path = Path(path) / file
        # Rows 0-5 are header decoration; data starts at row 6.
        df = pd.read_excel(file_path, header=None).iloc[6:]
        # The combined Jan-Feb sheet keeps amounts in column 1, others in 3.
        value_index = 1 if month == 2 else 3
        if "进口商品总值" in file.name:
            temp_df = df[[0, value_index]].rename(columns={0: 'commodity', value_index: 'import'})
            temp_df['commodity'] = (
                temp_df['commodity']
                .astype(str)
                .apply(lambda x: clean_commodity_name(x, preserve_keywords=PRESERVE_PARENTHESES_KEYWORDS))
                .replace(CUSTOM_COMMODITY_REPLACEMENTS, regex=False)
            )
            temp_df['import'] = pd.to_numeric(temp_df['import'].replace('--', 0), errors='coerce')
            # Keep only the first occurrence of each commodity.
            temp_df = temp_df.drop_duplicates(subset=['commodity'], keep='first')
            import_df = pd.concat([import_df, temp_df])

        elif "出口商品总值" in file.name:
            temp_df = df[[0, value_index]].rename(columns={0: 'commodity', value_index: 'export'})
            temp_df['commodity'] = (
                temp_df['commodity']
                .astype(str)
                .apply(lambda x: clean_commodity_name(x, preserve_keywords=PRESERVE_PARENTHESES_KEYWORDS))
                .replace(CUSTOM_COMMODITY_REPLACEMENTS, regex=False)
            )
            temp_df['export'] = pd.to_numeric(temp_df['export'].replace('--', 0), errors='coerce')
            temp_df = temp_df.drop_duplicates(subset=['commodity'], keep='first')
            export_df = pd.concat([export_df, temp_df])

    save_to_database(import_df, export_df, year, month)
+
def save_to_database(import_df, export_df, year, month):
    """Join the import and export frames per commodity and bulk-insert
    prov_commodity_trade rows.

    For month == 2 (cumulative Jan-Feb sheet) amounts are halved and a
    synthetic January row is inserted alongside the February row.
    """
    # Outer join keeps commodities that appear on only one side.
    merged_df = pd.merge(
        import_df.groupby('commodity')['import'].sum().reset_index(),
        export_df.groupby('commodity')['export'].sum().reset_index(),
        on='commodity',
        how='outer'
    )

    year_month = f'{year}-{month:02d}'
    processed_commodities = set()
    sql_arr = []
    sql_arr_copy = []
    try:
        for _, row in merged_df.iterrows():
            commodity_name = str(row['commodity'])
            # Hard-coded skip list of labels with no commodity-table entry.
            if commodity_name == '肉类' or commodity_name == '其他' or commodity_name == '干鲜瓜果' or commodity_name == '钟表':
                print(f'{commodity_name} 商品不存在,跳过')
                continue
            commodity_code, commodity_name_fix = base_mysql.get_commodity_id(commodity_name)
            if not commodity_code:
                print(f"未找到商品名称 '{commodity_name}' 对应的 ID")
                continue
            if not commodity_name_fix or commodity_name_fix in processed_commodities:
                continue

            # Scale 万元 -> 元. NOTE(review): a NaN side propagates into
            # monthly_import/monthly_export here; only monthly_total is
            # NaN-guarded — confirm format_sql_value handles NaN.
            monthly_import = round(row['import'] * 10000, 4)
            monthly_export = round(row['export'] * 10000, 4)
            monthly_total = round(
                (0 if pd.isna(monthly_import) else monthly_import) +
                (0 if pd.isna(monthly_export) else monthly_export),
                4
            )

            if month == 2:
                # Split cumulative Jan-Feb amounts evenly over both months.
                year_month_2 = f'{year}-01'
                monthly_import = round(monthly_import / 2, 4)
                monthly_export = round(monthly_export / 2, 4)
                monthly_total = round(monthly_import + monthly_export, 4)
                sql = (f"INSERT INTO t_yujin_crossborder_prov_commodity_trade "
                       f"(crossborder_year, crossborder_year_month, prov_code, prov_name, commodity_code, commodity_name, monthly_total, monthly_export, monthly_import, create_time) VALUES "
                       f"('{year}', '{year_month_2}', '340000', '安徽省', '{commodity_code}', '{commodity_name_fix}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, now());")
                sql_arr_copy.append(sql)

            sql = (f"INSERT INTO t_yujin_crossborder_prov_commodity_trade "
                   f"(crossborder_year, crossborder_year_month, prov_code, prov_name, commodity_code, commodity_name, monthly_total, monthly_export, monthly_import, create_time) VALUES "
                   f"('{year}', '{year_month}', '340000', '安徽省', '{commodity_code}', '{commodity_name_fix}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, now());")
            sql_arr.append(sql)

            processed_commodities.add(commodity_name_fix)
            # print(f'{commodity_name} -> {commodity_name_fix}')

    except Exception as e:
        print(f"{year_month} prov_commodity_trade 生成 SQL 文件时发生异常: {str(e)}")

    print(f"√ {year_month} prov_commodity_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    # Bulk insert current-month rows, then the synthetic January rows.
    base_mysql.bulk_insert(sql_arr)
    if month == 2:
        print(f"√ {year_month} prov_commodity_trade copy 成功生成 SQL 文件 size {len(sql_arr_copy)} ")
        base_mysql.bulk_insert(sql_arr_copy)
    print(f"√ {year_month} prov_commodity_trade SQL 存表完成!\n")
+
def hierarchical_traversal(root_path):
    """Layered walk: year directories -> month directories, newest first."""
    root = Path(root_path)
    # All directories whose name looks like a year.
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    # Newest year first.
    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # NOTE(review): the label prints "jiangsu" inside the Anhui module —
        # presumably copy-pasted; confirm and relabel.
        print(f"\n年份:{year_dir.name} | 省份:jiangsu")

        # Collect month subdirectories.
        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({
                    "path": item,
                    "month": int(item.name)
                })
        # Newest month first.
        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                process_folder(md['path'])
+
if __name__ == '__main__':
    # Standalone entry point: re-parse every downloaded month for commodity data.
    hierarchical_traversal(base_country_code.download_dir)

    # Debug alternative: process a single month directory.
    # root = Path(base_country_code.download_dir)/'2025'/'04'
    # process_folder(root)
    print("安徽合肥海关类章所有文件处理完成!")

+ 258 - 0
hebei/crawl_gov_hebei_full.py

@@ -0,0 +1,258 @@
+import os
+import random
+import re
+import time
+from pathlib import Path
+
+from faker import Faker
+from selenium import webdriver
+from selenium.webdriver import FirefoxOptions
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from utils import base_country_code, base_mysql
+import gov_commodity_hebei_import_export
+import gov_commodity_hebei_country
+import gov_commodity_hebei_city
+
# Shared download root for this crawler; created up-front so the browser
# profile can write into it immediately.
download_dir = base_country_code.download_dir
Path(download_dir).mkdir(parents=True, exist_ok=True)
+
+
def get_current_target_titles(year=2025, month=4):
    """Return the Hebei report titles to look for in the given period.

    Generalized from hard-coded "2025年4月" strings; the defaults preserve
    the previous behaviour for existing callers.

    :param year: report year, e.g. 2025
    :param month: report month (no zero padding in the published titles)
    :return: list of the four expected article titles
    """
    prefix = f"{year}年{month}月河北"
    return [
        f"{prefix}分进口商品",
        f"{prefix}分出口商品",
        f"{prefix}分国家",
        f"{prefix}分地市",
    ]
+
def configure_stealth_options():
    """Build FirefoxOptions hardened against webdriver detection, with
    silent downloads routed into download_dir.

    :return: configured FirefoxOptions (headless, fixed viewport)
    """
    opts = FirefoxOptions()
    print("当前下载路径:", Path(download_dir).resolve())
    # Download behaviour: save into download_dir without prompting
    opts.set_preference("browser.download.dir", download_dir)
    opts.set_preference("browser.download.folderList", 2)
    opts.set_preference("browser.download.manager.showWhenStarting", False)
    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
                        "application/octet-stream, application/vnd.ms-excel")  # cover common Excel MIME types
    opts.set_preference("browser.download.manager.useWindow", False)  # no download-manager window
    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # no completion alert

    # Anti-detection flags
    opts.set_preference("dom.webdriver.enabled", False)
    opts.set_preference("useAutomationExtension", False)
    opts.add_argument("--disable-blink-features=AutomationControlled")

    # Randomised fingerprint: user agent and accepted languages
    fake = Faker()
    opts.set_preference("general.useragent.override", fake.firefox())
    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")

    # Fixed viewport, headless mode
    opts.add_argument("--width=1440")
    opts.add_argument("--height=900")
    opts.add_argument("--headless")
    return opts
+
+
def remove_prefix_from_url(url):
    """Normalise a report link: strip a numeric "N." prefix from the file
    name and prepend the customs domain when the link is relative.

    Fix: the previous version computed the cleaned file name but only used
    it for relative links, so absolute links kept the prefix. The prefix is
    now removed in both cases. (Callers only use the result for logging and
    the extension check — the download itself is triggered by clicking the
    link element, so this cannot break the download.)

    :param url: raw href value, absolute or relative
    :return: cleaned, absolute URL
    """
    path_parts = url.split('/')
    # Drop a leading "<digits>." from the file name, e.g. "1.2025年...xls"
    path_parts[-1] = re.sub(r'^\d+\.', '', path_parts[-1])
    url = '/'.join(path_parts)

    # Relative links need the site domain prepended
    if not url.startswith(('http://', 'https://')):
        url = 'http://shijiazhuang.customs.gov.cn' + url

    return url
+
def find_target_links(driver):
    """Scan the current listing page and download every matching report.

    Iterates the article list; links whose title mentions import/export
    commodities, countries or cities are clicked to download their Excel
    file, which is then moved into download_dir/<year>/<month>/.

    :param driver: live selenium WebDriver positioned on a listing page
    :return: 'stop' when a 2022-dated title is reached (caller should stop
        paginating), otherwise None
    """
    # Wait for the page body to finish loading
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
    )

    element_arr = driver.find_elements(By.XPATH, '//div[@class="list_con"]//ul[@class="easysite-list-modelone"]//a')
    if not element_arr:
        print("未找到目标标题")
        return None
    for elements in element_arr:
        file_name = elements.text.strip()
        if not file_name:
            continue
        # Data before 2023 is out of scope — signal the caller to stop paging
        if file_name.startswith('2022'):
            return 'stop'
        if '进口商品' in file_name or '出口商品' in file_name or '分国家' in file_name or '分国别' in file_name or '地市' in file_name:
            file_url = elements.get_attribute("href")
            file_url = remove_prefix_from_url(file_url)

            if not file_url.lower().endswith(('.xls', '.xlsx')):
                print(f"跳过非 Excel 文件: {file_url}")
                continue
            print(f"正在下载: {file_name} → {file_url}")
            # Snapshot files present before the download so the new one can be identified
            existing_files = set(f.name for f in Path(download_dir).glob('*'))
            # Random delay before clicking, to look less like automation
            time.sleep(random.uniform(1, 3))
            elements.click()

            try:
                downloaded_file = wait_for_download_complete(existing_files=existing_files)
            except Exception as e:
                print(f"下载失败: {str(e)}")
                continue
            # `month` here is the END month of a "1-2月" style range
            year, start_month, month = extract_year_and_month(file_name)
            final_path = Path(download_dir) / year / month / f"{file_name}.xls"
            if os.path.exists(final_path):
                print(f"文件已存在:{file_name} 正在覆盖...")
                os.unlink(final_path)

            final_dir = Path(download_dir) / year / month
            final_dir.mkdir(parents=True, exist_ok=True)
            print(f"√ 正在移动文件 {downloaded_file} 至 {final_path}")
            downloaded_file.rename(final_path)
            print(f"√ 下载成功:{final_path}")
        else:
            print(f'{file_name} 不需要此文件,跳过')
            continue
    return None
+
+
def extract_year_and_month(file_name):
    """Parse the reporting period out of a report title.

    Supports both "1.2025年1-2月xxx" (range) and "2025年3月xxx" (single
    month) formats.

    :param file_name: report title / file name
    :return: (year, start_month, end_month) — months zero-padded to 2 digits
    :raises ValueError: when no period can be found in the name
    """
    matched = re.search(r"\b(\d{4})年(\d{1,2})(?:-(\d{1,2}))?月", file_name)
    if not matched:
        raise ValueError(f"无法从文件名中提取年份和月份:{file_name}")
    year, first = matched.group(1), matched.group(2)
    # A single-month title has no range part — reuse the start month
    last = matched.group(3) or first
    return year, first.zfill(2), last.zfill(2)
+
def crawl_with_selenium(url):
    """Drive Firefox through the paginated listing at *url*.

    Downloads every matching report on each page via find_target_links,
    then follows the "下一页" (next page) control until it disappears or a
    2022 title signals 'stop'. Always quits the driver.

    :param url: first listing page URL
    """
    driver = webdriver.Firefox(options=configure_stealth_options())

    try:
        # Inject anti-detection script: hide navigator.webdriver, mute alerts
        driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', { 
                get: () => undefined 
            });
            window.alert = () => {};
        """)

        # Load the first listing page
        driver.get(url)

        while True:
            # Download everything of interest on the current page
            result = find_target_links(driver)
            if result and result == 'stop':
                break

            # Wait for the pagination widget to render
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "gg_page"))
            )
            # Locate the "next page" control
            xpath = f'//div[@class="easysite-page-wrap"]//a[@title="下一页"]'
            next_page_btn = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            # The target URL lives in the element's onclick handler
            next_page_url = next_page_btn.get_attribute("onclick")
            if not next_page_url:
                print("已到达最后一页,停止爬取")
                break
            # Extract the single-quoted URL from the onclick attribute
            next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
            if not next_page_url.startswith(('http://', 'https://')):
                base_url = 'http://shijiazhuang.customs.gov.cn'  # site domain
                next_page_url = base_url + next_page_url

            # Navigate to the next page
            driver.get(next_page_url)

            print(f"开始爬取 {next_page_url} 页面数据")
    finally:
        driver.quit()
+
def wait_for_download_complete(timeout=30, existing_files=None):
    """
    Poll the download directory until a new, fully written file appears.

    :param timeout: seconds to wait before giving up
    :param existing_files: file names present before the download started
    :return: Path of the newest completed file
    :raises TimeoutError: when no new file appears within *timeout* seconds
    """
    deadline = time.time() + timeout
    partial_suffixes = ('.part', '.crdownload')

    if existing_files is None:
        existing_files = {entry.name for entry in Path(download_dir).glob('*')}

    while time.time() < deadline:
        # New, non-temporary, non-empty files only
        fresh = [
            entry for entry in Path(download_dir).glob('*')
            if entry.name not in existing_files
            and not entry.name.endswith(partial_suffixes)
            and entry.stat().st_size > 0
        ]
        # Return the most recently modified candidate
        if fresh:
            return max(fresh, key=lambda entry: entry.stat().st_mtime)
        time.sleep(2)

    raise TimeoutError("文件下载超时")
+
+
def hierarchical_traversal(root_path):
    """Walk the download tree (year -> month, newest first) and run all
    three Hebei processors on every month directory.

    :param root_path: root download directory
    """
    root = Path(root_path)
    # Year directories directly under the root, e.g. download/2025
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and base_country_code.YEAR_PATTERN.match(entry.name)
    ]

    # Newest year first
    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        # Label fixed from the copy-pasted "jiangsu": this crawler handles Hebei.
        print(f"\n年份:{year_dir.name} | 省份:hebei")

        # Month sub-directories, e.g. download/2025/03
        month_dirs = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and base_country_code.MONTH_PATTERN.match(entry.name)
        ]
        # Newest month first
        for md in sorted(month_dirs, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
            gov_commodity_hebei_import_export.process_folder(md['path'])
            gov_commodity_hebei_country.process_folder(md['path'])
            gov_commodity_hebei_city.process_folder(md['path'])
+
+
+if __name__ == "__main__":
+    crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html')
+    # 等待5s后执行
+    time.sleep(5)
+    hierarchical_traversal(base_country_code.download_dir)
+    print(f"河北石家庄海关全量数据下载任务完成")
+    time.sleep(5)
+    base_mysql.update_january_yoy('河北省')
+    base_mysql.update_shandong_yoy('河北省')
+    print("河北石家庄海关城市同比sql处理完成")

+ 125 - 0
hebei/gov_commodity_hebei_city.py

@@ -0,0 +1,125 @@
+from pathlib import Path
+
+import pandas
+import pandas as pd
+
+from utils import base_country_code, base_mysql
+from utils.base_country_code import format_sql_value
+
# Hebei prefecture-level city name -> administrative division code
# (the same 6-digit codes used in the trade tables' city_code column).
city_code_map = {
    "石家庄市": "130100",
    "唐山市": "130200",
    "秦皇岛市": "130300",
    "邯郸市": "130400",
    "邢台市": "130500",
    "保定市": "130600",
    "张家口市": "130700",
    "承德市": "130800",
    "沧州市": "130900",
    "廊坊市": "131000",
    "衡水市": "131100",
}
+
def get_df(path):
    """Locate the city-level (地市) workbook in *path* and load its data rows.

    :param path: year/month folder to search
    :return: DataFrame of rows from the sixth row onward (header rows
        skipped), or None when no matching file exists
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None
    for file in file_paths:
        if "地市" in file.name:
            print(f"处理多文件: {file.name}")
            # glob() already yields the full path; the previous
            # `Path(path) / file` re-join duplicated the directory part
            # whenever `path` was relative.
            return pd.read_excel(file, header=None).iloc[5:]
    return None
+
def process_folder(path):
    """Parse the city-level (地市) workbook of one year/month folder and
    insert rows into t_yujin_crossborder_prov_region_trade.

    2023-02 workbooks hold Jan+Feb combined values: amounts are halved and
    an extra 2023-01 row is emitted alongside the 2023-02 one.

    :param path: .../<year>/<month> directory
    """
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    df = get_df(path)
    if df is None:
        print("未找到任何文件")
        return None

    # Column layout depends on the publication format of the period
    if year == 2025 and month >= 3:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 2, 10, 18
    elif year_month in ['2023-02', '2025-01', '2024-01']:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 1, 5, 9
    else:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 1, 9, 17
    country_name_index = 1 if year == 2025 and month >= 3 else 0
    sql_arr = []
    sql_arr_copy = []
    for index, row in df.iterrows():

        city_name = str(row.values[country_name_index]).strip()
        # removeprefix, not lstrip: lstrip('河北省') strips any leading
        # characters from the set {河, 北, 省} rather than the literal prefix.
        if city_name.startswith('河北省'):
            city_name = city_name.removeprefix('河北省')

        city_code = city_code_map.get(city_name)
        if not city_code:
            print(f"未找到省 '{city_name}' 对应市编码")
            continue

        monthly_export, monthly_import, monthly_total = value_row(row, col_total_index, col_monthly_export_index, col_monthly_import_index)
        yoy_export, yoy_import, yoy_import_export = 0, 0, 0
        if year_month == '2023-02':
            # Combined Jan+Feb figures: halve them and emit a 2023-01 copy
            monthly_import = round(float(monthly_import) / 2, 4)
            monthly_export = round(float(monthly_export) / 2, 4)
            monthly_total = round(float(monthly_total) / 2, 4)
            sql_1 = (f"INSERT INTO t_yujin_crossborder_prov_region_trade "
                   f"(crossborder_year, crossborder_year_month, prov_code, prov_name, city_code, city_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
                   f"('2023', '2023-01', '130000', '河北省', '{city_code}', '{city_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
            sql_arr_copy.append(sql_1)

        # Row for the current month (halved values when 2023-02)
        sql = (f"INSERT INTO t_yujin_crossborder_prov_region_trade "
               f"(crossborder_year, crossborder_year_month, prov_code, prov_name, city_code, city_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
               f"('{year}', '{year_month}', '130000', '河北省', '{city_code}', '{city_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
        sql_arr.append(sql)

    print(f"√ {year_month} prov_region_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    # Bulk-insert the generated statements
    base_mysql.bulk_insert(sql_arr)
    if year_month == '2023-02':
        print(f"√ {year_month} sql_arr_copy 成功生成 SQL 文件 size {len(sql_arr_copy)} ")
        base_mysql.bulk_insert(sql_arr_copy)
    print(f"√ {year_month} prov_region_trade SQL 存表完成!")
+
+
def value_row(row, col_total_index, col_monthly_export_index, col_monthly_import_index):
    """Read the total/export/import cells of one worksheet row.

    :return: (monthly_export, monthly_import, monthly_total) as stripped strings
    """
    cells = row.values
    monthly_export = str(cells[col_monthly_export_index]).strip()
    monthly_import = str(cells[col_monthly_import_index]).strip()
    monthly_total = str(cells[col_total_index]).strip()
    return monthly_export, monthly_import, monthly_total
+
def value_special_handler(value):
    """Normalise placeholder cells: NaN/None and '--' become the string "0"."""
    if pandas.isna(value):
        return "0"
    if value == "--":
        return "0"
    return value
+
def hierarchical_traversal(root_path):
    """Walk the download tree (year -> month, newest first) and process
    every month folder's city-level workbook.

    :param root_path: root download directory
    """
    root = Path(root_path)
    # Year directories directly under the root
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and base_country_code.YEAR_PATTERN.match(entry.name)
    ]

    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        # Label fixed from the copy-pasted "jiangsu": this module handles Hebei.
        print(f"\n年份:{year_dir.name} | 省份:hebei")

        month_dirs = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and base_country_code.MONTH_PATTERN.match(entry.name)
        ]
        # Newest month first
        for md in sorted(month_dirs, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
            process_folder(md['path'])
+
if __name__ == '__main__':
    # Entry point: process every downloaded year/month folder.
    hierarchical_traversal(base_country_code.download_dir)
    print(f"河北石家庄海关城市所有文件处理完成!")

+ 120 - 0
hebei/gov_commodity_hebei_country.py

@@ -0,0 +1,120 @@
+from pathlib import Path
+
+import pandas
+import pandas as pd
+
+from utils import base_country_code, base_mysql
+from utils.base_country_code import format_sql_value
+
# Aggregate regions/organisations present in the per-country sheet that
# must be skipped — only individual countries are stored.
EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
                   "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",
                   "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区"]
+
def get_df(path):
    """Locate the per-country (国) workbook in *path* and load its data rows.

    :param path: year/month folder to search
    :return: DataFrame of rows from the seventh row onward (header rows
        skipped), or None when no matching file exists
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None
    for file in file_paths:
        if "国" in file.name:
            print(f"处理多文件: {file.name}")
            # glob() already yields the full path; the previous
            # `Path(path) / file` re-join duplicated the directory part
            # whenever `path` was relative.
            return pd.read_excel(file, header=None).iloc[6:]
    return None
+
def process_folder(path):
    """Parse the per-country workbook of one year/month folder and insert
    rows into t_yujin_crossborder_prov_country_trade.

    2023-02 workbooks hold Jan+Feb combined values: amounts are halved and
    an extra 2023-01 row is emitted alongside the 2023-02 one.

    :param path: .../<year>/<month> directory
    """
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    df = get_df(path)
    if df is None:
        print("未找到任何文件")
        return None

    # Column layout depends on the publication format of the period
    if year == 2025 and month >= 3:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 2, 10, 18
    elif year_month in ['2023-02', '2025-01', '2024-01']:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 1, 5, 9
    else:
        col_total_index, col_monthly_export_index, col_monthly_import_index = 1, 9, 17

    country_name_index = 1 if year == 2025 and month >= 3 else 0
    continue_index = 6
    sql_arr = []
    sql_arr_copy = []
    for index, row in df.iterrows():
        if index < continue_index:
            continue

        country_name = str(row.values[country_name_index]).strip()
        # Strip a trailing parenthetical, fullwidth or ASCII. The previous
        # `rsplit("(")[0] or rsplit("(")[0]` never applied the second
        # variant because the first result is always a non-empty string.
        if country_name.endswith(")") or country_name.endswith(")"):
            country_name = country_name.rsplit("(", 1)[0].rsplit("(", 1)[0]

        if country_name in EXCLUDE_REGIONS:
            continue

        country_code = base_country_code.COUNTRY_CODE_MAPPING.get(country_name)
        if not country_code:
            print(f"{year_month} 未找到国家 '{country_name}' 对应国家的编码")
            continue

        monthly_export, monthly_import, monthly_total = value_row(row, col_total_index, col_monthly_export_index, col_monthly_import_index)
        yoy_export, yoy_import, yoy_import_export = 0, 0, 0
        if year_month == '2023-02':
            # Combined Jan+Feb figures: halve them and emit a 2023-01 copy
            monthly_import = round(float(monthly_import) / 2, 4)
            monthly_export = round(float(monthly_export) / 2, 4)
            monthly_total = round(float(monthly_total) / 2, 4)
            sql = (f"INSERT INTO t_yujin_crossborder_prov_country_trade "
                   f"(crossborder_year, crossborder_year_month, prov_code, prov_name, country_code, country_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
                   f"('2023', '2023-01', '130000', '河北省', '{country_code}', '{country_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
            sql_arr_copy.append(sql)
        # Row for the current month (halved values when 2023-02)
        sql = (f"INSERT INTO t_yujin_crossborder_prov_country_trade "
               f"(crossborder_year, crossborder_year_month, prov_code, prov_name, country_code, country_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
               f"('{year}', '{year_month}', '130000', '河北省', '{country_code}', '{country_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
        sql_arr.append(sql)

    print(f"√ {year_month} prov_country_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    # Bulk-insert the generated statements
    base_mysql.bulk_insert(sql_arr)
    if year_month == '2023-02':
        print(f"√ {year_month} prov_country_trade 成功生成 SQL 文件 size {len(sql_arr_copy)} ")
        base_mysql.bulk_insert(sql_arr_copy)
    print(f"√ {year_month} prov_country_trade SQL 存表完成!")
+
def value_row(row,col_total_index, col_monthly_export_index, col_monthly_import_index):
    """Extract the (export, import, total) cells of a row as stripped strings."""
    def cell(idx):
        # Cells may hold numbers or text; normalise to a trimmed string
        return str(row.values[idx]).strip()

    return cell(col_monthly_export_index), cell(col_monthly_import_index), cell(col_total_index)
+
def value_special_handler(value):
    """Normalise placeholder cells: NaN/None and '--' become the string "0"."""
    return "0" if pandas.isna(value) or value == "--" else value
+
def hierarchical_traversal(root_path):
    """Walk the download tree (year -> month, newest first) and process
    every month folder's per-country workbook.

    :param root_path: root download directory
    """
    root = Path(root_path)
    # Year directories directly under the root
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and base_country_code.YEAR_PATTERN.match(entry.name)
    ]

    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        # Label fixed from the copy-pasted "jiangsu": this module handles Hebei.
        print(f"\n年份:{year_dir.name} | 省份:hebei")

        month_dirs = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and base_country_code.MONTH_PATTERN.match(entry.name)
        ]
        # Newest month first
        for md in sorted(month_dirs, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
            process_folder(md['path'])
+
if __name__ == '__main__':
    # Entry point: process every downloaded year/month folder.
    hierarchical_traversal(base_country_code.download_dir)
    print(f"河北石家庄海关国家的所有文件处理完成!")

+ 178 - 0
hebei/gov_commodity_hebei_import_export.py

@@ -0,0 +1,178 @@
+from pathlib import Path
+
+import pandas as pd
+import re
+
+from utils import base_country_code, base_mysql
+from utils.base_country_code import format_sql_value
+
# Commodity-name spellings normalised to match the reference table looked
# up via base_mysql.get_commodity_id.
CUSTOM_COMMODITY_REPLACEMENTS = {
    '稻谷及大米': '稻谷、大米及大米粉',
    '有机发光二极管平板显示模组': '有机发光二极管(OLED)平板显示模组',
}

# Names whose parenthesised part is meaningful and must NOT be stripped
# by clean_commodity_name.
PRESERVE_PARENTHESES_KEYWORDS = {
    '汽车(包括底盘)',
}
+
+def clean_commodity_name(name, preserve_keywords=None):
+    """
+    自定义清洗商品名称逻辑,支持条件保留中文括号内容
+
+    :param name: 商品名称字符串
+    :param preserve_keywords: 需要保留括号的关键词集合
+    :return: 清洗后的商品名称
+    """
+    name = str(name).strip().replace('(', '(').replace(')', ')')
+
+    # 去除非必要符号
+    name = re.sub(r'[#*?]', '', name)
+    name = re.sub(r'_x000D_', '', name)
+
+    # 判断是否需要保留括号内容
+    if preserve_keywords:
+        for keyword in preserve_keywords:
+            if keyword == name:
+                # 匹配到关键词时,不移除括号内容
+                return name
+
+    # 默认移除中文括号及内容
+    name = re.sub(r'([^)]*)', '', name)
+    return name.strip()
+
def process_folder(path):
    """Parse the import/export commodity workbooks of one year/month folder,
    merge them per commodity, and persist the totals via save_to_database.

    :param path: .../<year>/<month> directory containing the Excel files
    """
    year, month = base_country_code.extract_year_month_from_path(path)

    # Column layout shifted with the 2025-03 publication format
    name_index = 1 if year == 2025 and month >= 3 else 0
    value_index = 5 if year == 2025 and month >= 3 else 4
    res = df_data(path, name_index, value_index)
    if not res:
        print(f"{path} 上月目录里文件未找到包含 主出、主进商品 sheet")
        return
    export_df, import_df = res

    # Outer-join imports and exports per commodity so one-sided rows survive
    merged_df = pd.merge(
        import_df.groupby('commodity')['import'].sum().reset_index() if not import_df.empty else pd.DataFrame(columns=['commodity', 'import']),
        export_df.groupby('commodity')['export'].sum().reset_index() if not export_df.empty else pd.DataFrame(columns=['commodity', 'export']),
        on='commodity',
        how='outer'
    ).infer_objects()

    save_to_database(merged_df, year, month)
+
def save_to_database(merged_df, year, month):
    """Insert merged commodity rows for one month into
    t_yujin_crossborder_prov_commodity_trade.

    :param merged_df: frame with 'commodity', 'import', 'export' columns
    :param year: reporting year (int)
    :param month: reporting month (int)

    For 2023-02 the workbook values cover Jan+Feb combined, so each amount
    is halved and an extra 2023-01 row is written as well.
    """
    processed_commodities = set()
    sql_arr = []
    sql_arr_copy = []
    year_month = f'{year}-{month:02d}'
    try:
        for _, row in merged_df.iterrows():
            commodity_name = str(row['commodity']).strip()
            # Resolve the canonical commodity code/name from MySQL
            commodity_code,commodity_name_fix = base_mysql.get_commodity_id(commodity_name)
            if not commodity_code:
                print(f"未找到商品名称 '{commodity_name}' 对应的 ID")
                continue
            # Skip unnamed or already-emitted commodities
            if not commodity_name_fix or commodity_name_fix in processed_commodities:
                continue

            monthly_import = round(row['import'], 4)
            monthly_export = round(row['export'], 4)
            monthly_total = round(monthly_import + monthly_export, 4)

            if year_month == '2023-02':
                # Combined Jan+Feb figures: halve them and emit a 2023-01 copy
                monthly_import = round(monthly_import / 2, 4)
                monthly_export = round(monthly_export / 2, 4)
                monthly_total = round(monthly_import + monthly_export, 4)
                sql = (f"INSERT INTO t_yujin_crossborder_prov_commodity_trade "
                       f"(crossborder_year, crossborder_year_month, prov_code, prov_name, commodity_code, commodity_name, monthly_total, monthly_export, monthly_import, create_time) VALUES "
                       f"('2023', '2023-01', '130000', '河北省', '{commodity_code}', '{commodity_name_fix}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, now());")
                sql_arr_copy.append(sql)

            # Row for the current month (halved values when 2023-02)
            sql = (f"INSERT INTO t_yujin_crossborder_prov_commodity_trade "
                   f"(crossborder_year, crossborder_year_month, prov_code, prov_name, commodity_code, commodity_name, monthly_total, monthly_export, monthly_import, create_time) VALUES "
                   f"('{year}', '{year_month}', '130000', '河北省', '{commodity_code}', '{commodity_name_fix}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, now());")
            sql_arr.append(sql)

            processed_commodities.add(commodity_name_fix)

    except Exception as e:
        print(f"{year_month} prov_commodity_trade 生成 SQL 文件时发生异常: {str(e)}")

    print(f"√ {year_month} prov_commodity_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    # Bulk-insert the generated statements
    base_mysql.bulk_insert(sql_arr)
    if year_month == '2023-02':
        print(f"√ {year_month} prov_commodity_trade copy 成功生成 SQL 文件 size {len(sql_arr_copy)} ")
        base_mysql.bulk_insert(sql_arr_copy)
    print(f"√ {year_month} prov_commodity_trade SQL 存表完成!")
+
+
def _clean_sheet(file_path, name_index, value_index, value_col):
    """Load one workbook and return a cleaned (commodity, value) frame.

    Shared by the import and export branches of df_data (the two branches
    were previously duplicated): reads the sheet skipping five header rows,
    normalises commodity names, coerces values to numbers ('--' -> 0) and
    drops duplicate commodities.
    """
    df = pd.read_excel(file_path, header=None).iloc[5:]
    temp_df = df[[name_index, value_index]].rename(
        columns={name_index: 'commodity', value_index: value_col})
    temp_df['commodity'] = (
        temp_df['commodity']
        .astype(str)
        .apply(lambda x: clean_commodity_name(x, preserve_keywords=PRESERVE_PARENTHESES_KEYWORDS))
        .replace(CUSTOM_COMMODITY_REPLACEMENTS, regex=False)
    )
    temp_df[value_col] = pd.to_numeric(temp_df[value_col].replace('--', 0), errors='coerce')
    return temp_df.drop_duplicates(subset=['commodity'], keep='first')


def df_data(path, name_index, value_index):
    """Collect export and import commodity frames from the Excel files in *path*.

    :param path: year/month folder to scan
    :param name_index: column index of the commodity name
    :param value_index: column index of the amount
    :return: (export_df, import_df) tuple, or None when the folder is empty
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None
    import_df = pd.DataFrame()
    export_df = pd.DataFrame()
    for file_path in file_paths:
        if '出口' in file_path.name:
            export_df = pd.concat([export_df, _clean_sheet(file_path, name_index, value_index, 'export')])
        if '进口' in file_path.name:
            import_df = pd.concat([import_df, _clean_sheet(file_path, name_index, value_index, 'import')])
    return export_df, import_df
+
def hierarchical_traversal(root_path):
    """Walk the download tree (year -> month, newest first) and process
    every month folder's import/export commodity workbooks.

    :param root_path: root download directory
    """
    root = Path(root_path)
    # Year directories directly under the root
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and base_country_code.YEAR_PATTERN.match(entry.name)
    ]

    # Newest year first
    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        # Label fixed from the copy-pasted "jiangsu": this module handles Hebei.
        print(f"\n年份:{year_dir.name} | 省份:hebei")

        month_dirs = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and base_country_code.MONTH_PATTERN.match(entry.name)
        ]
        # Newest month first
        for md in sorted(month_dirs, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
            process_folder(md['path'])
+
if __name__ == '__main__':
    # Entry point: process every downloaded year/month folder.
    hierarchical_traversal(base_country_code.download_dir)

    # root = Path(base_country_code.download_dir)/'2023'/'02'
    # process_folder(root)
    print(f"河北石家庄海关出入口商品所有文件处理完成!")

+ 295 - 0
jiangsu/crawl_gov_jiangsu_full.py

@@ -0,0 +1,295 @@
+import os
+import random
+import re
+import subprocess
+import time
+import rarfile
+import shutil
+from pathlib import Path
+
+from faker import Faker
+from selenium import webdriver
+from selenium.webdriver import FirefoxOptions
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+import gov_commodity_jiangsu_country
+import gov_commodity_jiangsu_city
+import gov_commodity_jiangsu_import_export
+
+from utils import base_country_code, base_mysql
+
+# 显式指定 unrar 路径(根据实际情况修改)
+rarfile.UNRAR_EXECUTABLE = r"C:\Program Files\WinRAR\UnRAR.exe"
+# rarfile.UNRAR_EXECUTABLE = "/usr/bin/unrar"  # Linux/macOS
+download_dir = base_country_code.download_dir
+Path(download_dir).mkdir(parents=True, exist_ok=True)
+
def configure_stealth_options():
    """Build FirefoxOptions tuned for silent downloads plus basic anti-bot
    evasion (stealth flags, randomised user agent, fixed headless viewport)."""
    options = FirefoxOptions()
    print("当前下载路径:", Path(download_dir).resolve())

    # Download behaviour: save straight into download_dir with no prompts,
    # then anti-detection preferences.
    for pref, value in (
        ("browser.download.dir", download_dir),
        ("browser.download.folderList", 2),
        ("browser.download.manager.showWhenStarting", False),
        ("browser.helperApps.neverAsk.saveToDisk",
         "application/octet-stream, application/vnd.ms-excel"),
        ("browser.download.manager.useWindow", False),
        ("browser.download.manager.showAlertOnComplete", False),
        ("dom.webdriver.enabled", False),
        ("useAutomationExtension", False),
    ):
        options.set_preference(pref, value)
    options.add_argument("--disable-blink-features=AutomationControlled")

    # Randomised fingerprint per run.
    faker = Faker()
    options.set_preference("general.useragent.override", faker.firefox())
    options.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")

    # Fixed viewport, headless mode.
    for argument in ("--width=1440", "--height=900", "--headless"):
        options.add_argument(argument)
    return options
+
def find_target_links(driver):
    """Find .rar download links on the current listing page, download each
    archive, extract the wanted monthly spreadsheets into
    <download_dir>/<year>/<month>, and delete the archive.

    Returns 'stop' once a file from 2022 is seen (pagination cutoff),
    otherwise None.
    """
    # Wait until the listing container has rendered.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
    )

    try:
        # Every anchor in the list that points at a .rar archive.
        xpath = '//ul[@class="conList_ul"]//a[contains(@href, ".rar")]'
        elements = driver.find_elements(By.XPATH, xpath)
        if not elements:
            return None

        # Names already handled in this pass. NOTE(review): this set is
        # populated but never consulted, so duplicates are not actually skipped.
        processed_files = set()

        # Click each link and process the resulting download.
        for download_btn in elements:
            file_name = download_btn.text.strip()
            print(f"正在下载: {file_name}")

            # Snapshot the directory so the new file can be identified.
            existing_files = set(f.name for f in Path(download_dir).glob('*'))

            # Trigger the browser download.
            download_btn.click()
            time.sleep(random.uniform(1, 3))

            # Block until the .rar file has fully arrived.
            rar_files = wait_for_download_complete(existing_files=existing_files)
            if not rar_files:
                print("未找到新下载的 .rar 文件")
                continue

            downloaded_file = rar_files[0]
            if downloaded_file.suffix == '.rar':
                with rarfile.RarFile(downloaded_file) as rf:
                    # Spreadsheet members of the archive.
                    xls_files = [f for f in rf.namelist() if f.endswith('.xls') or f.endswith('.xlsx')]
                    if not xls_files:
                        print(f"压缩包 {downloaded_file.name} 中没有 .xls 文件")
                        continue

                    for xls_file in xls_files:
                        # Data from 2022 marks the end of the wanted range.
                        if xls_file.startswith('2022'):
                            return 'stop'
                        # Skip sheet variants that are not ingested (USD values,
                        # enterprise nature, trade mode, consignor location,
                        # major commodities).
                        if not xls_file or '美元值' in xls_file or '企业性质' in xls_file or '贸易方式' in xls_file or '按收发货所在地' in xls_file or '主要商品' in xls_file:
                            print(f"检测到不需要的文件:{xls_file},跳过")
                            continue
                        # Extract into a temp dir via the external unrar tool.
                        temp_dir = Path(download_dir) / 'temp'
                        temp_dir.mkdir(parents=True, exist_ok=True)
                        if not extract_rar(downloaded_file, temp_dir):
                            print(f"解压文件 {downloaded_file.name} 时发生错误")
                            continue
                        # Derive the destination <year>/<month> from the member name.
                        match = re.search(r"(\d{4})年(\d{1,2})月", xls_file)
                        if not match:
                            raise ValueError(f"无效标题格式:{xls_file}")
                        year = match.group(1)
                        month = match.group(2).zfill(2)

                        extracted_file = temp_dir / xls_file
                        final_path = Path(download_dir) / year / month / extracted_file.name
                        if os.path.exists(final_path):
                            print(f"文件已存在:{extracted_file.name} 正在覆盖...")
                            os.unlink(final_path)

                        final_dir = Path(download_dir) / year / month
                        final_dir.mkdir(parents=True, exist_ok=True)
                        print(f"√ 正在移动文件 {extracted_file} 至 {final_path}")
                        try:
                            extracted_file.rename(final_path)
                            print(f"√ 下载成功:{final_path}")
                        except Exception as e:
                            print(f"文件移动失败: {str(e)}")

                    # Remove the temp dir even if it still has content.
                    try:
                        shutil.rmtree(temp_dir)  # handles non-empty dirs, unlike os.rmdir
                    except Exception as e:
                        print(f"删除临时目录失败: {str(e)}")

                # Archive fully processed; delete it.
                print(f"删除 .rar 文件:{downloaded_file}")
                os.unlink(downloaded_file)
            else:
                print(f"文件 {downloaded_file.name} 不是 .rar 文件,请手动处理")
            # Record the handled name.
            processed_files.add(file_name)
        return None
    except Exception as e:
        print(f"下载时发生异常: {str(e)}")
+
def extract_rar(rar_path, extract_to):
    """Extract *rar_path* into *extract_to* with the WinRAR command-line tool.

    Fallback for archives the ``rarfile`` module cannot handle.

    :param rar_path: path to the .rar archive
    :param extract_to: destination directory
    :return: True on success, False on failure
    """
    # NOTE(review): hard-coded Windows install path — make configurable if
    # this ever runs outside the current deployment.
    winrar_path = r"C:\Program Files\WinRAR\Rar.exe"  # Rar.exe preferred over WinRAR.exe (no GUI)
    cmd = [winrar_path, 'x', '-y', rar_path, str(extract_to)]

    # CREATE_NO_WINDOW keeps a console window from flashing up on Windows.
    creationflags = subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0

    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        creationflags=creationflags,
    )

    if result.returncode == 0:
        print(f"解压成功: {rar_path} → {extract_to}")
        return True
    # errors='replace' so a non-GBK byte in the tool output cannot raise
    # UnicodeDecodeError and mask the real extraction failure.
    print(f"解压失败: {result.stderr.decode('gbk', errors='replace')}")
    return False
+
+
def crawl_with_selenium(url):
    """Drive a headless Firefox through the customs listing pages, downloading
    archives on each page until find_target_links() signals 'stop' or there is
    no next page."""
    driver = webdriver.Firefox(options=configure_stealth_options())

    try:
        # Hide the webdriver flag and neutralise alert() popups.
        driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', { 
                get: () => undefined 
            });
            window.alert = () => {};
        """)

        # Load the first listing page.
        driver.get(url)

        while True:
            # Download everything on the current page; 'stop' means the
            # 2022 cutoff was reached.
            result = find_target_links(driver)
            if result == 'stop':
                break

            # Wait for the pagination widget to render.
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "gg_page"))
            )
            # Locate the "next page" control.
            xpath = f'//div[@class="easysite-page-wrap"]//a[@title="下一页"]'
            next_page_btn = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            # The target URL is embedded in the onclick handler.
            next_page_url = next_page_btn.get_attribute("onclick")
            if not next_page_url:
                print("已到达最后一页,停止爬取")
                break
            # Pull the single-quoted URL out of the onclick attribute.
            next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
            if not next_page_url.startswith(('http://', 'https://')):
                # NOTE(review): host says shijiazhuang while this module crawls
                # the Nanjing customs site — confirm the correct domain.
                base_url = 'http://shijiazhuang.customs.gov.cn'  # replace with the actual domain
                next_page_url = base_url + next_page_url

            # Navigate to the next listing page.
            driver.get(next_page_url)

            print(f"开始爬取 {next_page_url} 页面数据")

    finally:
        driver.quit()
+
+
def wait_for_download_complete(timeout=30, existing_files=None):
    """Wait for a new ``.rar`` download in ``download_dir`` to finish.

    :param timeout: maximum seconds to wait before giving up
    :param existing_files: file names present before the download started;
        anything not in this set counts as "new". Defaults to a snapshot
        taken on entry.
    :return: list of Path objects for the newly completed .rar files
    :raises TimeoutError: when no stable new .rar file appears in time
    """
    start_time = time.time()

    if existing_files is None:
        existing_files = set(f.name for f in Path(download_dir).glob('*'))

    while (time.time() - start_time) < timeout:
        # Removed dead locals from the original (`current_files`, `temp_exts`)
        # — neither was ever read.
        new_files = [f for f in Path(download_dir).glob('*.rar') if f.name not in existing_files]
        if new_files:
            # A file counts as complete once its size stops changing.
            stable = True
            for file in new_files:
                prev_size = file.stat().st_size
                time.sleep(1)
                if file.stat().st_size != prev_size:
                    stable = False
                    break
            if stable:
                return new_files
        time.sleep(2)
    raise TimeoutError("未找到 .rar 文件或超时")
+
def hierarchical_traversal(root_path, all_records):
    """Walk *root_path* as <year>/<month> directories, newest first, and run
    the three Jiangsu parsers (categories, countries, cities) on each month."""
    root = Path(root_path)

    # Year-level directories validated by the shared pattern.
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and base_country_code.YEAR_PATTERN.match(entry.name)
    ]

    # Newest year first.
    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        print(f"\n年份:{year_dir.name} | 省份:jiangsu")

        # Month sub-directories ("01".."12") under this year.
        month_entries = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and base_country_code.MONTH_PATTERN.match(entry.name)
        ]

        # Newest month first.
        for month_entry in sorted(month_entries, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{month_entry['month']:02d} | 路径:{month_entry['path']}")
            gov_commodity_jiangsu_import_export.process_folder(month_entry['path'], all_records)
            gov_commodity_jiangsu_country.process_folder(month_entry['path'])
            gov_commodity_jiangsu_city.process_folder(month_entry['path'])
+
# Script entry: download, parse, then backfill year-over-year figures.
if __name__ == "__main__":
    # Stage 1: download all monthly archives from the Nanjing customs site.
    crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html')
    print(f"江苏南京海关全量数据下载任务完成")
    # Give the filesystem a moment to settle before parsing.
    time.sleep(5)
    # Stage 2: parse every year/month folder into the database.
    all_records = base_mysql.get_hs_all()
    hierarchical_traversal(base_country_code.download_dir, all_records)
    print("江苏南京海关类章、国家、城市所有文件处理完成!")
    time.sleep(5)
    # Stage 3: backfill year-over-year figures for the province.
    base_mysql.update_january_yoy('江苏省')
    base_mysql.update_shandong_yoy('江苏省')
    print("江苏南京海关城市同比sql处理完成")
+

+ 136 - 0
jiangsu/gov_commodity_jiangsu_city.py

@@ -0,0 +1,136 @@
+import time
+from pathlib import Path
+
+import pandas as pd
+
+from utils import base_country_code, base_mysql
+from utils.base_country_code import format_sql_value
+
# Jiangsu prefecture-level city name -> administrative division code
# (first four digits of the GB/T 2260 code; province prefix 32).
city_code_map = {
    "南京市": "3201",
    "无锡市": "3202",
    "徐州市": "3203",
    "常州市": "3204",
    "苏州市": "3205",
    "南通市": "3206",
    "连云港市": "3207",
    "淮安市": "3208",
    "盐城市": "3209",
    "扬州市": "3210",
    "镇江市": "3211",
    "泰州市": "3212",
    "宿迁市": "3213"
}

# County-level cities that appear in the customs sheet but have no
# prefecture-level code of their own; matching rows are skipped during parsing.
ignore_city_code_arr = ['江阴市','宜兴市','常熟市','张家港市','昆山市','吴江市','太仓市','启东市','东台市','仪征市','丹阳市','兴化市']
+
def get_df(path):
    """Locate the city ("地市") trade sheet under *path* and load it.

    Two layouts exist:
      * a single workbook containing a sheet whose name contains "地"
        (df_type 0, data starts at row 6);
      * multiple files where the "地区" file holds the data
        (df_type 1, data starts at row 7).

    :return: tuple ``(DataFrame, df_type)``, or None when nothing usable
        is found.
    """
    # The original used `global df, df_type`, so a month with no matching
    # file silently reused the previous month's data (or raised NameError).
    # Locals plus explicit `return None` make the miss visible to callers.
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None

    if len(file_paths) == 1:
        file_path = file_paths[0]
        print(f"处理单文件: {file_path.name}")
        xls = pd.ExcelFile(file_path)

        sheet_name = base_country_code.find_sheet_by_keyword(file_path, "地")
        if not sheet_name:
            print(f"{file_path} 未找到包含 地市 sheet")
            return None
        df = pd.read_excel(xls, sheet_name=sheet_name, header=None).iloc[5:]
        return df, 0

    for file in file_paths:
        if "地区" in file.name:
            print(f"处理多文件: {file.name}")
            # glob() already yields full paths; no need to re-join with `path`.
            df = pd.read_excel(file, header=None).iloc[6:]
            return df, 1

    # No single workbook and no "地区" file found.
    return None
+
def process_folder(path):
    """Parse one month folder's city sheet and insert prov_region_trade rows."""
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    sql_arr = []
    res = get_df(path)
    if res is None:
        print(f"{year_month} prov_region_trade 未找到包含 地市 sheet")
        return
    df, df_type = res

    # Column layout differs between the two workbook formats.
    if df_type == 0:
        name_col, total_col = 0, 1
    else:
        name_col, total_col = 1, 2

    for _, row in df.iterrows():
        city_name = str(row.values[name_col]).strip()

        # Skip "其中:" breakout rows and county-level cities we do not track.
        skip = city_name.startswith('其中') or any(
            ignored.endswith(city_name) for ignored in ignore_city_code_arr
        )
        if skip:
            print(f"忽略 {city_name}")
            continue

        city_code = city_code_map.get(city_name)
        if not city_code:
            print(f"未找到省 '{city_name}' 对应市编码")
            continue

        monthly_export, monthly_import, monthly_total = value_row(row, total_col)
        if df_type == 0:
            # Single-file workbooks report values in 万元; scale to 元.
            monthly_export = round(float(monthly_export) * 10000, 4)
            monthly_import = round(float(monthly_import) * 10000, 4)
            monthly_total = round(float(monthly_total) * 10000, 4)
        yoy_export, yoy_import, yoy_import_export = 0, 0, 0

        # Assemble the INSERT statement for this city row.
        sql = (f"INSERT INTO t_yujin_crossborder_prov_region_trade "
               f"(crossborder_year, crossborder_year_month, prov_code, prov_name, city_code, city_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
               f"('{year}', '{year_month}', '320000', '江苏省', '{city_code}', '{city_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
        sql_arr.append(sql)

    print(f"√ {year_month} prov_region_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    # Bulk-insert the generated statements.
    base_mysql.bulk_insert(sql_arr)
    print(f"√ {year_month} prov_region_trade SQL 存表完成!")
+
+
def value_row(row, col_total_index):
    """Read (export, import, total) as stripped strings from a city row;
    export and import sit 2 and 4 columns after the total column."""
    cells = row.values
    total = str(cells[col_total_index]).strip()
    export = str(cells[col_total_index + 2]).strip()
    imported = str(cells[col_total_index + 4]).strip()
    return export, imported, total
+
def hierarchical_traversal(root_path):
    """Walk *root_path* as <year>/<month> directories, newest first, and run
    process_folder() on every month directory."""
    root = Path(root_path)
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and base_country_code.YEAR_PATTERN.match(entry.name)
    ]

    # Newest year first.
    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        print(f"\n年份:{year_dir.name} | 省份:jiangsu")

        month_entries = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and base_country_code.MONTH_PATTERN.match(entry.name)
        ]

        # Newest month first.
        for month_entry in sorted(month_entries, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{month_entry['month']:02d} | 路径:{month_entry['path']}")
            process_folder(month_entry['path'])
+
# Script entry: process the city sheets for every downloaded month, then
# backfill year-over-year figures for the province.
if __name__ == '__main__':
    hierarchical_traversal(base_country_code.download_dir)
    print(f"江苏南京海关城市所有文件处理完成!")
    time.sleep(5)
    base_mysql.update_january_yoy('江苏省')
    base_mysql.update_shandong_yoy('江苏省')
    print("江苏南京同比sql处理完成")

+ 135 - 0
jiangsu/gov_commodity_jiangsu_country.py

@@ -0,0 +1,135 @@
+from pathlib import Path
+
+import pandas as pd
+
+from utils import base_country_code, base_mysql
+from utils.base_country_code import format_sql_value
+
# Aggregate rows in the customs sheet (continents and trade blocs) that are
# not individual countries; rows with these names are skipped.
EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
                   "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",
                   "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区"]
+
def get_df(path):
    """Locate the country ("国别") trade sheet under *path* and load it.

    Two layouts exist:
      * a single workbook containing a sheet whose name contains "国别"
        (df_type 0, data starts at row 6);
      * multiple files where the "国别" file holds the data
        (df_type 1, data starts at row 7).

    :return: tuple ``(DataFrame, df_type)``, or None when nothing usable
        is found.
    """
    # The original used `global df, df_type`, so a month with no matching
    # file silently reused stale data (or raised NameError). Locals plus an
    # explicit `return None` make the miss visible to callers.
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None

    if len(file_paths) == 1:
        file_path = file_paths[0]
        print(f"处理单文件: {file_path.name}")
        xls = pd.ExcelFile(file_path)

        sheet_name = base_country_code.find_sheet_by_keyword(file_path, "国别")
        if not sheet_name:
            # Message fixed: the original said 类章 although this function
            # searches for the 国别 sheet.
            print(f"{file_path} 未找到包含 国别 sheet")
            return None
        df = pd.read_excel(xls, sheet_name=sheet_name, header=None).iloc[5:]
        return df, 0

    for file in file_paths:
        if "国别" in file.name:
            print(f"处理多文件: {file.name}")
            # glob() already yields full paths; no need to re-join with `path`.
            df = pd.read_excel(file, header=None).iloc[6:]
            return df, 1

    # No single workbook and no "国别" file found.
    return None
+
def process_folder(path):
    """Parse one month folder's country ("国别") sheet and insert
    prov_country_trade rows."""
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    sql_arr = []
    try:
        res = get_df(path)
        if res is None:
            # Previously an unguarded unpack raised TypeError here and was
            # swallowed by the broad except below.
            print(f"{year_month} 未找到国别数据文件")
            return
        df, df_type = res
        # Column layout differs between single-file and multi-file formats.
        if df_type == 0:
            country_name_index = 0
            col_total_index, col_monthly_export_index, col_monthly_import_index = 1, 3, 5
        else:
            country_name_index = 1
            col_total_index, col_monthly_export_index, col_monthly_import_index = 2, 4, 6

        for index, row in df.iterrows():
            # Guard against header residue (iloc slicing preserves the
            # original row index, so data rows start at 5 or 6).
            if index < 4:
                continue

            # Strip a trailing parenthetical qualifier, full- or half-width.
            # Bug fix: the original `a or b` form never applied the half-width
            # branch, because rsplit on a missing separator returns the whole
            # (truthy) string.
            country_name = str(row.values[country_name_index]).strip()
            if country_name.endswith(")"):
                country_name = country_name.rsplit("(", 1)[0]
            elif country_name.endswith(")"):
                country_name = country_name.rsplit("(", 1)[0]

            # Skip aggregate regions such as continents and trade blocs.
            if country_name in EXCLUDE_REGIONS:
                continue

            # Map the name to the ISO-style country code.
            country_code = base_country_code.COUNTRY_CODE_MAPPING.get(country_name)
            if not country_code:
                print(f"{year_month} 未找到国家 '{country_name}' 对应的编码")
                continue

            monthly_export, monthly_import, monthly_total = value_row(row, col_total_index, col_monthly_export_index, col_monthly_import_index)
            if df_type == 0:
                # Single-file workbooks report values in 万元; scale to 元.
                monthly_export, monthly_import, monthly_total = round(float(monthly_export) * 10000, 4), round(float(monthly_import) * 10000, 4), round(float(monthly_total) * 10000, 4)
            yoy_export, yoy_import, yoy_import_export = 0, 0, 0

            # Assemble the INSERT statement for this country row.
            sql = (
                f"INSERT INTO t_yujin_crossborder_prov_country_trade "
                f"(crossborder_year, crossborder_year_month, prov_code, prov_name, country_code, country_name, "
                f"monthly_total, monthly_export, monthly_import, yoy_import_export, yoy_import, yoy_export, create_time) "
                f"VALUES ('{year}', '{year_month}', '320000', '江苏省', '{country_code}', '{country_name}', "
                f"{format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', "
                f"'{yoy_export}', NOW());"
            )
            sql_arr.append(sql)
    except Exception as e:
        print(f"{year_month} 处理时发生异常: {str(e)}")

    print(f"√ {year_month} 成功生成 SQL 条数: {len(sql_arr)}")
    # Bulk-insert whatever was generated (possibly empty on error).
    base_mysql.bulk_insert(sql_arr)
    print(f"√ {year_month} prov_country_trade SQL 存表完成!")
+
def value_row(row, col_total_index, col_monthly_export_index, col_monthly_import_index):
    """Read (export, import, total) cells, mapping NaN and the '--'
    placeholder to the string '0'.

    Bug fix: the NaN check now runs on the raw cell value. The previous
    version stringified first, turning NaN into the literal 'nan', which
    ``pd.isna()`` no longer recognises — so empty cells leaked through.
    """
    def value_special_handler(value):
        # Treat missing values and the '--' placeholder as zero.
        if pd.isna(value) or str(value).strip() == "--":
            return "0"
        return str(value).strip()

    monthly_total = value_special_handler(row.values[col_total_index])
    monthly_export = value_special_handler(row.values[col_monthly_export_index])
    monthly_import = value_special_handler(row.values[col_monthly_import_index])

    return monthly_export, monthly_import, monthly_total
+
+
def hierarchical_traversal(root_path):
    """Walk *root_path* as <year>/<month> directories, newest first, and run
    process_folder() on every month directory."""
    root = Path(root_path)
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and base_country_code.YEAR_PATTERN.match(entry.name)
    ]

    # Newest year first.
    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        print(f"\n年份:{year_dir.name} | 省份:jiangsu")

        month_entries = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and base_country_code.MONTH_PATTERN.match(entry.name)
        ]

        # Newest month first.
        for month_entry in sorted(month_entries, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{month_entry['month']:02d} | 路径:{month_entry['path']}")
            process_folder(month_entry['path'])
+
+
# Script entry: process the country sheets for every downloaded month.
if __name__ == '__main__':
    hierarchical_traversal(base_country_code.download_dir)
    print("江苏南京海关国别所有文件处理完成!")

+ 171 - 0
jiangsu/gov_commodity_jiangsu_import_export.py

@@ -0,0 +1,171 @@
+import re
+from pathlib import Path
+
+import pandas as pd
+
+from utils import base_country_code, base_mysql
+from utils.base_country_code import format_sql_value
+
# Directory-name validators: four-digit year, zero-padded month 01-12
# (local copies; also defined in utils.base_country_code).
YEAR_PATTERN = re.compile(r"^\d{4}$")
MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")
# Module-level fallback for the HS category records when run as a script.
all_records = []
+
+
def _column_df(df, name_idx, value_idx, value_name, scale=1):
    """Extract a (commodity, <value_name>) frame from raw sheet columns;
    the '--' placeholder becomes 0 and values are optionally scaled."""
    out = df[[name_idx, value_idx]].rename(columns={name_idx: 'commodity', value_idx: value_name})
    out[value_name] = pd.to_numeric(out[value_name].replace('--', 0), errors='coerce')
    if scale != 1:
        out[value_name] = out[value_name] * scale
    return out


def process_folder(path, all_records):
    """Parse one month folder's HS category ("类章") sheet and store
    commodity rows via save_to_database().

    Handles two layouts: a single workbook with a 类章 sheet (values in 万元,
    scaled to 元) and the older multi-file layout (2023-01 .. 2024-10) where a
    "商品类章" file holds the data already in 元.
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    if len(file_paths) == 1:
        file_path = file_paths[0]
        print(f"处理单文件: {file_path.name}")
        xls = pd.ExcelFile(file_path)
        sheet_name = base_country_code.find_sheet_by_keyword(file_path, "类章")
        if not sheet_name:
            print(f"{file_path} 未找到包含 类章 sheet")
            return
        # The 2024-11 sheet has one fewer header row than the others.
        skip_index = 4 if year_month == '2024-11' else 5
        df = pd.read_excel(xls, sheet_name=sheet_name, header=None).iloc[skip_index:]
        # Columns: 0 name, 1 total, 3 export, 5 import — all in 万元.
        import_df = _column_df(df, 0, 5, 'import', scale=10000)
        export_df = _column_df(df, 0, 3, 'export', scale=10000)
        total_df = _column_df(df, 0, 1, 'total', scale=10000)
        save_to_database(import_df, export_df, total_df, year, month, all_records)
    else:
        # Multi-file layout: columns 1 name, 2 total, 3 export, 5 import.
        for file in file_paths:
            if "商品类章" in file.name:
                print(f"处理多文件: {file.name}")
                df = pd.read_excel(file, header=None).iloc[6:]
                import_df = _column_df(df, 1, 5, 'import')
                export_df = _column_df(df, 1, 3, 'export')
                total_df = _column_df(df, 1, 2, 'total')
                save_to_database(import_df, export_df, total_df, year, month, all_records)
                return
        # No 商品类章 file: previously three empty frames were passed on,
        # which crashed in save_to_database on set_index('commodity').
        print(f"{year_month} 未找到 商品类章 文件")
+
def save_to_database(import_df, export_df, total_df, year, month, all_records):
    """Merge the per-direction frames and insert prov_commodity_trade rows.

    Sheet row order is preserved because matching against *all_records* (the
    ordered HS category list) walks forward one entry at a time through
    extract_category_or_chapter().
    """
    # Outer join keeps a commodity present in any of the three frames.
    # concat preserves the sheets' row order, so the original no-op
    # `original_order` re-sort (and `merged_df = merged_df`) were dropped.
    merged_df = pd.concat(
        [import_df.set_index('commodity'), export_df.set_index('commodity'), total_df.set_index('commodity')], axis=1,
        join='outer').reset_index()

    sql_arr = []
    processed_commodities = set()

    all_records_index = 0
    year_month = f'{year}-{month:02d}'
    for _, row in merged_df.iterrows():
        commodity_name = str(row['commodity'])
        # Advance the cursor into all_records for this sheet row.
        result = extract_category_or_chapter(commodity_name, all_records_index)
        if result is None:
            print(f"未找到商品名称 '{commodity_name}' 对应的ID")
            continue
        if result >= len(all_records):
            print(f"all_records 已超限 '{commodity_name}' 跳过")
            continue
        all_records_index = result

        commodity_code, category_name = int(all_records[all_records_index][0]), str(all_records[all_records_index][1])
        if commodity_code in processed_commodities:
            continue

        monthly_import = round(row['import'], 4)
        monthly_export = round(row['export'], 4)
        monthly_total = round(row['total'], 4)

        # Assemble the INSERT statement for this category row.
        sql = (f"INSERT INTO t_yujin_crossborder_prov_commodity_trade "
               f"(crossborder_year, crossborder_year_month, prov_code, prov_name, commodity_code, commodity_name, monthly_total, monthly_export, monthly_import, create_time, commodity_source) VALUES "
               f"('{year}', '{year_month}', '320000', '江苏省', '{commodity_code}', '{category_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, now(), 1);")
        sql_arr.append(sql)

        processed_commodities.add(commodity_code)

    print(f"√ {year_month} 成功生成SQL文件 size {len(sql_arr)} ")
    base_mysql.bulk_insert(sql_arr)
    print(f"√ {year_month} prov_commodity_trade SQL 存表完成!")
+
+
def extract_category_or_chapter(text, all_records_index):
    """Return the all_records cursor for *text*: 0 when the row is the first
    HS category ("第一类"/"第1类"), otherwise the next sequential index."""
    label = text.strip()
    match = re.match(r'^第(一|\d+)类', label, re.IGNORECASE | re.UNICODE)
    if match and match.group(1) in ('1', '一'):
        return 0
    return all_records_index + 1
+
def hierarchical_traversal(root_path, all_records):
    """Walk *root_path* as <year>/<month> directories, newest first, and
    process each month's category sheet with the given HS record list."""
    root = Path(root_path)
    year_dirs = [
        entry for entry in root.iterdir()
        if entry.is_dir() and YEAR_PATTERN.match(entry.name)
    ]

    # Newest year first.
    for year_dir in sorted(year_dirs, key=lambda d: d.name, reverse=True):
        print(f"\n年份:{year_dir.name} | 省份:jiangsu")

        month_entries = [
            {"path": entry, "month": int(entry.name)}
            for entry in year_dir.iterdir()
            if entry.is_dir() and MONTH_PATTERN.match(entry.name)
        ]

        # Newest month first.
        for month_entry in sorted(month_entries, key=lambda m: m["month"], reverse=True):
            print(f"  月份:{month_entry['month']:02d} | 路径:{month_entry['path']}")
            process_folder(month_entry['path'], all_records)
+
# Script entry: load the ordered HS category list, then process every month.
if __name__ == '__main__':
    all_records = base_mysql.get_hs_all()
    hierarchical_traversal(base_country_code.download_dir, all_records)

    # Debug helper: process a single month folder instead of the whole tree.
    # root = Path(base_country_code.download_dir)/'2024'/'11'
    # process_folder(root, all_records)
    print("江苏南京海关类章所有文件处理完成!")

+ 363 - 0
utils/base_country_code.py

@@ -0,0 +1,363 @@
+import os
+import re
+from pathlib import Path
+
+import pandas as pd
+from openpyxl import load_workbook
+
+YEAR_PATTERN = re.compile(r"^\d{4}$")
+MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")
+
def format_sql_value(value):
    """
    Convert a Python value into a SQL-literal fragment:
    - NaN / None -> NULL (unquoted)
    - numbers    -> emitted as-is
    - others     -> single-quoted string, with embedded single quotes
                    doubled so values such as "O'Neill" cannot break the
                    generated INSERT statement (basic injection safety).
    """
    if pd.isna(value):
        return 'NULL'
    elif isinstance(value, (int, float)):
        return str(value)
    else:
        # Double single quotes per the SQL standard escaping rule.
        return "'{}'".format(str(value).replace("'", "''"))
+
def find_sheet_by_keyword(file_path, keyword):
    """
    Find the first sheet whose name contains *keyword* (case-insensitive).
    Supports .xlsx (openpyxl) and .xls (xlrd).

    :param file_path: Path to the Excel file
    :param keyword: substring to match in the sheet name (e.g. '类章')
    :return: first matching sheet name, or None
    :raises ValueError: for unsupported file extensions
    """
    if file_path.suffix == ".xlsx":
        workbook = load_workbook(filename=file_path, read_only=True)
        try:
            sheets = workbook.sheetnames
        finally:
            # read_only workbooks keep the file handle open until closed;
            # without this, the file stays locked on Windows.
            workbook.close()
    elif file_path.suffix == ".xls":
        import xlrd
        workbook = xlrd.open_workbook(file_path)
        sheets = workbook.sheet_names()
    else:
        raise ValueError(f"不支持的文件格式:{file_path.suffix}")

    # Case-insensitive substring match; first hit wins.
    for sheet in sheets:
        if keyword.lower() in sheet.lower():
            return sheet
    return None
+
def get_previous_month_dir(current_path):
    """Return the sibling directory for the previous month, or None when
    the path tail is not <YYYY>/<MM> or the month is January."""
    try:
        year_name = current_path.parent.name
        month_name = current_path.name

        if not YEAR_PATTERN.match(year_name) or not MONTH_PATTERN.match(month_name):
            return None

        previous = int(month_name) - 1
        if previous >= 1:
            return current_path.parent.parent / year_name / f"{previous:02d}"
        # January: no previous month within the same year.
        return None
    except Exception as exc:
        print(f"前月目录生成失败:{str(exc)}")
        return None
+
+COUNTRY_CODE_MAPPING = {
+    # ================= 亚洲 =================
+    "阿富汗": "AF",
+    "巴林": "BH",
+    "孟加拉国": "BD",
+    "不丹": "BT",
+    "文莱": "BN",
+    "缅甸": "MM",
+    "柬埔寨": "KH",
+    "塞浦路斯": "CY",
+    "朝鲜": "KP",
+    "中国香港": "HK",
+    "印度": "IN",
+    "印度尼西亚": "ID",
+    "伊朗": "IR",
+    "伊拉克": "IQ",
+    "以色列": "IL",
+    "日本": "JP",
+    "约旦": "JO",
+    "科威特": "KW",
+    "老挝": "LA",
+    "黎巴嫩": "LB",
+    "中国澳门": "MO",
+    "马来西亚": "MY",
+    "马尔代夫": "MV",
+    "蒙古": "MN",
+    "尼泊尔": "NP",
+    "阿曼": "OM",
+    "巴基斯坦": "PK",
+    "巴勒斯坦": "PS",
+    "菲律宾": "PH",
+    "卡塔尔": "QA",
+    "沙特阿拉伯": "SA",
+    "新加坡": "SG",
+    "韩国": "KR",
+    "斯里兰卡": "LK",
+    "叙利亚": "SY",
+    "泰国": "TH",
+    "土耳其": "TR",
+    "阿联酋": "AE",
+    "也门": "YE",
+    "越南": "VN",
+    "中国": "CN",
+    "中国台湾": "TW",
+    "哈萨克斯坦": "KZ",
+    "吉尔吉斯斯坦": "KG",
+    "塔吉克斯坦": "TJ",
+    "土库曼斯坦": "TM",
+    "乌兹别克斯坦": "UZ",
+    "格鲁吉亚": "GE",
+    "亚美尼亚": "AM",
+    "阿塞拜疆": "AZ",
+
+    # ================= 非洲 =================
+    "阿尔及利亚": "DZ",
+    "安哥拉": "AO",
+    "贝宁": "BJ",
+    "博茨瓦纳": "BW",
+    "布隆迪": "BI",
+    "喀麦隆": "CM",
+    "佛得角": "CV",
+    "中非": "CF",
+    "乍得": "TD",
+    "科摩罗": "KM",
+    "刚果共和国": "CG",
+    "吉布提": "DJ",
+    "埃及": "EG",
+    "赤道几内亚": "GQ",
+    "埃塞俄比亚": "ET",
+    "加蓬": "GA",
+    "冈比亚": "GM",
+    "加纳": "GH",
+    "几内亚": "GN",
+    "几内亚比绍": "GW",
+    "科特迪瓦": "CI",
+    "肯尼亚": "KE",
+    "莱索托": "LS",
+    "利比里亚": "LR",
+    "利比亚": "LY",
+    "马达加斯加": "MG",
+    "马拉维": "MW",
+    "马里": "ML",
+    "毛里塔尼亚": "MR",
+    "毛里求斯": "MU",
+    "摩洛哥": "MA",
+    "莫桑比克": "MZ",
+    "纳米比亚": "NA",
+    "尼日尔": "NE",
+    "尼日利亚": "NG",
+    "卢旺达": "RW",
+    "圣多美和普林西比": "ST",
+    "塞内加尔": "SN",
+    "塞舌尔": "SC",
+    "塞拉利昂": "SL",
+    "索马里": "SO",
+    "南非": "ZA",
+    "苏丹": "SD",
+    "坦桑尼亚": "TZ",
+    "多哥": "TG",
+    "突尼斯": "TN",
+    "乌干达": "UG",
+    "布基纳法索": "BF",
+    "刚果民主共和国": "CD",
+    "赞比亚": "ZM",
+    "津巴布韦": "ZW",
+    "厄立特里亚": "ER",
+    "南苏丹": "SS",
+
+    # ================= 欧洲 =================
+    "比利时": "BE",
+    "丹麦": "DK",
+    "英国": "GB",
+    "德国": "DE",
+    "法国": "FR",
+    "爱尔兰": "IE",
+    "意大利": "IT",
+    "卢森堡": "LU",
+    "荷兰": "NL",
+    "希腊": "GR",
+    "葡萄牙": "PT",
+    "西班牙": "ES",
+    "阿尔巴尼亚": "AL",
+    "奥地利": "AT",
+    "保加利亚": "BG",
+    "芬兰": "FI",
+    "匈牙利": "HU",
+    "冰岛": "IS",
+    "列支敦士登": "LI",
+    "马耳他": "MT",
+    "挪威": "NO",
+    "波兰": "PL",
+    "罗马尼亚": "RO",
+    "瑞典": "SE",
+    "瑞士": "CH",
+    "爱沙尼亚": "EE",
+    "拉脱维亚": "LV",
+    "立陶宛": "LT",
+    "白俄罗斯": "BY",
+    "摩尔多瓦": "MD",
+    "俄罗斯": "RU",
+    "乌克兰": "UA",
+    "斯洛文尼亚": "SI",
+    "克罗地亚": "HR",
+    "捷克": "CZ",
+    "斯洛伐克": "SK",
+    "北马其顿": "MK",
+    "波斯尼亚和黑塞哥维那": "BA",
+    "梵蒂冈": "VA",
+    "塞尔维亚": "RS",
+    "黑山": "ME",
+
+    # ================= 美洲 =================
+    "安提瓜和巴布达": "AG",
+    "阿根廷": "AR",
+    "巴哈马": "BS",
+    "巴巴多斯": "BB",
+    "伯利兹": "BZ",
+    "玻利维亚": "BO",
+    "巴西": "BR",
+    "加拿大": "CA",
+    "智利": "CL",
+    "哥伦比亚": "CO",
+    "哥斯达黎加": "CR",
+    "古巴": "CU",
+    "多米尼克": "DM",
+    "多米尼加": "DO",
+    "厄瓜多尔": "EC",
+    "萨尔瓦多": "SV",
+    "格林纳达": "GD",
+    "危地马拉": "GT",
+    "圭亚那": "GY",
+    "海地": "HT",
+    "洪都拉斯": "HN",
+    "牙买加": "JM",
+    "墨西哥": "MX",
+    "尼加拉瓜": "NI",
+    "巴拿马": "PA",
+    "巴拉圭": "PY",
+    "秘鲁": "PE",
+    "圣卢西亚": "LC",
+    "圣文森特和格林纳丁斯": "VC",
+    "苏里南": "SR",
+    "特立尼达和多巴哥": "TT",
+    "美国": "US",
+    "乌拉圭": "UY",
+    "委内瑞拉": "VE",
+    "圣基茨和尼维斯": "KN",
+
+    # ================= 大洋洲 =================
+    "澳大利亚": "AU",
+    "斐济": "FJ",
+    "基里巴斯": "KI",
+    "马绍尔群岛": "MH",
+    "密克罗尼西亚联邦": "FM",
+    "瑙鲁": "NR",
+    "新西兰": "NZ",
+    "帕劳": "PW",
+    "巴布亚新几内亚": "PG",
+    "萨摩亚": "WS",
+    "所罗门群岛": "SB",
+    "汤加": "TO",
+    "图瓦卢": "TV",
+    "瓦努阿图": "VU",
+
+    # ================= 特殊地区 =================
+    "法属圭亚那": "GF",
+    "瓜德罗普": "GP",
+    "留尼汪": "RE",
+    "圣马丁": "MF",
+    "荷属圣马丁": "SX",
+    "法属波利尼西亚": "PF",
+    "新喀里多尼亚": "NC",
+    "库克群岛": "CK",
+    "关岛": "GU",
+    "波多黎各": "PR",
+    "美属萨摩亚": "AS",
+    "百慕大": "BM",
+    "开曼群岛": "KY",
+    "福克兰群岛(马尔维纳斯)": "FK",
+    "格陵兰": "GL",
+    "法属南方领地": "TF",
+    "赫德岛和麦克唐纳岛": "HM",
+    "托克劳": "TK",
+    "纽埃": "NU",
+    "诺福克岛": "NF",
+    "北马里亚纳群岛": "MP",
+    "皮特凯恩": "PN",
+    "圣赫勒拿": "SH",
+    "斯瓦尔巴群岛和扬马延岛": "SJ",
+    "东帝汶": "TL",
+    # ==== 欧洲特殊地区 ====
+    "加那利群岛": "IC",  # 西班牙特殊领土代码
+    "塞卜泰(休达)": "XC",  # 休达官方代码
+    "梅利利亚": "XL",  # 梅利利亚官方代码
+    "安道尔": "AD",
+    "直布罗陀": "GI",
+    "摩纳哥": "MC",
+    "圣马力诺": "SM",
+    "法罗群岛": "FO",  # 丹麦自治领
+    "奥兰群岛": "AX",  # 芬兰自治省
+    "格恩西": "GG",  # 英国皇家属地
+    "马恩岛": "IM",
+    "泽西": "JE",
+
+    # ==== 非洲特殊地区 ====
+    "西撒哈拉": "EH",  # 争议地区代码
+    "斯威士兰": "SZ",  # 正式国名为"Eswatini"但保留旧映射
+    "马约特": "YT",  # 法国海外省
+
+    # ==== 美洲特殊地区 ====
+    "英属印度洋领地": "IO",
+    "阿鲁巴": "AW",
+    "库拉索": "CW",
+    "马提尼克": "MQ",  # 法国海外省
+    "蒙特塞拉特": "MS",
+    "法属圣马丁": "MF",
+    "特克斯和凯科斯群岛": "TC",
+    "英属维尔京群岛": "VG",
+    "博纳尔,圣俄斯塔休斯和萨巴": "BQ",
+    "圣巴泰勒米": "BL",  # 法国海外集体
+    "美属维尔京群岛": "VI",
+    "安圭拉": "AI",
+    "圣皮埃尔和密克隆": "PM",
+
+    # ==== 大洋洲特殊地区 ====
+    "瓦利斯和富图纳": "WF",
+    "科科斯(基林)群岛": "CC",
+    "圣诞岛": "CX",
+    "美国本土外小岛屿": "UM",
+
+    # ==== 特殊标记 ====
+    "布维岛": "BV",  # 挪威属地
+    "南乔治亚岛和南桑德韦奇岛": "GS",
+    "国家(地区)不明": "XX"  # 自定义代码
+}
+
def extract_year_month_from_path(path):
    """Parse the .../<YYYY>/<MM> tail of *path* into (year, month) ints.

    :raises ValueError: when the path is too short or the tail does not
        match the expected year/month layout.
    """
    try:
        year_str, month_str = path.parts[-2], path.parts[-1]
    except IndexError:
        raise ValueError("路径结构不符合要求,示例:.../shandong/2025/04")
    if not YEAR_PATTERN.match(year_str):
        raise ValueError(f"无效年份格式:{year_str}")
    if not MONTH_PATTERN.match(month_str):
        raise ValueError(f"无效月份格式:{month_str}")
    return int(year_str), int(month_str)
+
# Default download roots, resolved relative to the current working directory.
download_dir = os.path.abspath(os.path.join('downloads'))
download_dir_find = os.path.abspath(os.path.join('downloads/demo'))

if __name__ == '__main__':
    # Smoke test: parse a sample year/month path.
    year, month = extract_year_month_from_path(Path(download_dir)/'2025'/'02')
    print(year, month)

+ 242 - 0
utils/base_mysql.py

@@ -0,0 +1,242 @@
+import pymysql
+from sqlalchemy import create_engine, text
+from urllib.parse import quote_plus
+
# Database connection settings, shared by pymysql and the SQLAlchemy engine.
# NOTE(review): credentials are hard-coded in source; consider moving them
# into environment variables or a config file kept out of version control.
DB_CONFIG = {
    'host': '10.130.75.149',
    'port': 3307,
    'user': 'yto_crm',
    'password': '%3sFUlsolaRI',
    'database': 'crm_uat',
    'charset': 'utf8mb4'
}
+
def get_commodity_id(commodity_name):
    """Look up (id, commodity_name) in t_yujin_crossborder_prov_commodity_category.

    Strategy: normalize the name (drop a trailing parenthesized suffix and
    punctuation), try a prefix LIKE match first; on multiple hits fall back
    to exact-name matching, finally retrying with full-width parentheses
    restored. Returns (None, None) on miss or database error.
    """
    fix_commodity_name = commodity_name
    if commodity_name.endswith(")") or commodity_name.endswith(")"):
        fix_commodity_name = commodity_name.rsplit("(")[0] or commodity_name.rsplit("(")[0]
    fix_commodity_name = fix_commodity_name.replace('*', '').replace('#', '').replace('“', '').replace('”', '').replace('。', '')

    connection = None  # FIX: keep the finally-block safe if connect() raises
    try:
        # Open a connection per call (no pooling on this path).
        connection = pymysql.connect(**DB_CONFIG)
        with connection.cursor() as cursor:
            # Prefix match first (cheap, usually unique).
            sql = "SELECT e.id, e.commodity_name FROM t_yujin_crossborder_prov_commodity_category e WHERE e.commodity_name like %s"
            cursor.execute(sql, (f"{fix_commodity_name}%",))
            result = cursor.fetchall()
            if result:
                if len(result) == 1:
                    return result[0][0], result[0][1]
                else:
                    print(f"查询结果为多条,商品id为:{result},fix_commodity_name:{fix_commodity_name},commodity_name: {commodity_name}")
                    sql = "SELECT e.id, e.commodity_name FROM t_yujin_crossborder_prov_commodity_category e WHERE e.commodity_name = %s"
                    cursor.execute(sql, (f"{fix_commodity_name}",))
                    result = cursor.fetchone()
                    if not result:
                        # Retry with the original name, half-width parens
                        # converted back to full-width.
                        commodity_name = commodity_name.replace("(", "(").replace(")", ")")
                        print(f"原商品名称查询,commodity_name:{commodity_name}")
                        sql = "SELECT e.id, e.commodity_name FROM t_yujin_crossborder_prov_commodity_category e WHERE e.commodity_name = %s"
                        cursor.execute(sql, (f"{commodity_name}",))
                        result = cursor.fetchone()
                        if result:
                            return result[0], result[1]
                        else:
                            return None, None
                    else:
                        return result[0], result[1]
            else:
                return None, None
    except Exception as e:
        print(f"查询数据库时发生异常: {str(e)}")
        return None, None
    finally:
        if connection:
            connection.close()
+
+
def get_hs_all():
    """Fetch all (id, category_name) rows from t_yujin_crossborder_hs_category.

    :return: tuple of rows, or None when the table is empty or the query fails
    """
    connection = None  # FIX: finally-block must not hit an unbound name
    try:
        connection = pymysql.connect(**DB_CONFIG)
        with connection.cursor() as cursor:
            sql = "SELECT e.id,e.category_name FROM t_yujin_crossborder_hs_category e"
            cursor.execute(sql)
            all_records = cursor.fetchall()
            # Preserve the original contract: empty result maps to None.
            if all_records:
                return all_records
            else:
                return None
    except Exception as e:
        print(f"查询数据库时发生异常: {str(e)}")
        return None
    finally:
        if connection:
            connection.close()
+
# URL-encode the password so special characters (e.g. '%') survive inside
# the SQLAlchemy connection URL.
encoded_password = quote_plus(DB_CONFIG["password"])

# Pooled SQLAlchemy engine shared by the bulk/update helpers below.
engine = create_engine(
    f"mysql+pymysql://{DB_CONFIG['user']}:{encoded_password}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}?charset={DB_CONFIG['charset']}",
    pool_size=5,
    max_overflow=10
)
+
def bulk_insert(sql_statements):
    """
    Execute a list of INSERT statements inside a single transaction.

    :param sql_statements: list of SQL INSERT strings; an empty/None list
        is a no-op. Re-raises on any database failure (transaction rolls back).
    """
    if not sql_statements:
        print("未提供有效的 SQL 插入语句,跳过操作")
        return

    try:
        with engine.connect() as conn:
            with conn.begin():
                for statement in sql_statements:
                    conn.execute(text(statement.strip()))
                print(f"成功执行 {len(sql_statements)} 条 SQL 插入语句")
    except Exception as e:
        print(f"数据库操作失败: {str(e)}")
        raise
+
def update_january_yoy(prov_name):
    """
    Recompute January year-over-year columns for *prov_name*.

    Self-joins t_yujin_crossborder_prov_region_trade on city_code against
    the same city's January row one year earlier and fills
    yoy_import_export / yoy_import / yoy_export (0.0 when the previous
    value is missing or zero). Only '-01' rows newer than 2023-01 change.

    :param prov_name: province name (no default exists — always pass one)
    :return: number of rows updated
    :raises RuntimeError: wrapping any database error
    """
    update_sql = text("""
                      UPDATE t_yujin_crossborder_prov_region_trade AS curr
                          INNER JOIN t_yujin_crossborder_prov_region_trade AS prev
                      ON curr.city_code = prev.city_code
                          AND prev.crossborder_year_month = DATE_FORMAT(
                          DATE_SUB(
                          STR_TO_DATE(CONCAT(curr.crossborder_year_month, '-01'), '%Y-%m-%d'),
                          INTERVAL 1 YEAR
                          ),
                          '%Y-01'
                          )
                          SET
                              curr.yoy_import_export = COALESCE (
                              ROUND(
                              (curr.monthly_total - prev.monthly_total) / NULLIF (prev.monthly_total, 0) * 100, 4
                              ), 0.0000
                              ), curr.yoy_import = COALESCE (
                              ROUND(
                              (curr.monthly_import - prev.monthly_import) / NULLIF (prev.monthly_import, 0) * 100, 4
                              ), 0.0000
                              ), curr.yoy_export = COALESCE (
                              ROUND(
                              (curr.monthly_export - prev.monthly_export) / NULLIF (prev.monthly_export, 0) * 100, 4
                              ), 0.0000
                              )
                      WHERE
                          curr.prov_name = :prov_name
                        AND curr.crossborder_year_month LIKE '%-01'
                        AND curr.crossborder_year_month
                          > '2023-01'
                      """)

    try:
        with engine.begin() as conn:
            result = conn.execute(update_sql, {'prov_name': prov_name})
            print(f"Updated {result.rowcount} rows for {prov_name}")
            return result.rowcount

    except Exception as e:
        print(f"Update failed: {str(e)}")
        raise RuntimeError(f"同比数据更新失败: {str(e)}") from e
+
def clear_old_shandong_yoy(prov_name):
    """
    Zero out the three YoY columns for *prov_name* rows before 2024-01.

    Only rows whose YoY values are currently non-zero are touched, which
    keeps the UPDATE cheap on repeated runs.

    :param prov_name: province name to clear
    :return: number of rows affected
    """
    clear_sql = text("""
                     UPDATE t_yujin_crossborder_prov_region_trade
                     SET yoy_import_export = 0.0000,
                         yoy_export        = 0.0000,
                         yoy_import        = 0.0000
                     WHERE prov_name = :prov_name
                       AND crossborder_year_month < '2024-01'
                       AND (yoy_import_export != 0 
           OR yoy_export != 0 
           OR yoy_import != 0) -- 优化:仅更新非零记录
                     """)

    try:
        with engine.begin() as conn:
            result = conn.execute(clear_sql, {'prov_name': prov_name})
            print(f"{prov_name} 旧数据清零记录数: {result.rowcount}")
            return result.rowcount
    except Exception as e:
        print(f"旧数据清零失败: {str(e)}")
        raise
+
def update_shandong_yoy(prov_name):
    """
    Full YoY refresh for *prov_name*: zero out pre-2024 rows, then
    recompute the YoY columns for rows from 2024-01 onward.

    :param prov_name: province name
    :return: dict {'cleared': rows_zeroed, 'updated': rows_recomputed}
    :raises: re-raises any database error after logging it
    """
    try:
        # Step 1: clear stale values on old rows.
        cleared = clear_old_shandong_yoy(prov_name)

        # Step 2: recompute the current window.
        updated = _update_shandong_new_yoy(prov_name)

        print(f"{prov_name} 同比处理完成 | 清零:{cleared} 更新:{updated}")
        return {'cleared': cleared, 'updated': updated}
    except Exception as e:
        # FIX: print() has no exc_info kwarg and the message was not an
        # f-string — the old handler raised a secondary TypeError.
        print(f"{prov_name} 数据处理失败: {str(e)}")
        raise
+
def _update_shandong_new_yoy(prov_name):
    """
    Recompute YoY columns for *prov_name* rows from 2024-01 onward
    (internal helper): each month is paired with the same city's row one
    year earlier and the percentage is TRUNCATEd to 4 decimals
    (0.0 when the previous value is missing or zero).

    :return: number of rows updated
    """
    update_sql = text("""UPDATE t_yujin_crossborder_prov_region_trade AS curr
                                INNER JOIN t_yujin_crossborder_prov_region_trade AS prev
                            ON curr.city_code = prev.city_code
                                AND prev.crossborder_year_month = DATE_FORMAT(
                                    DATE_SUB(
                                        STR_TO_DATE(CONCAT(curr.crossborder_year_month, '-01'), '%Y-%m-%d'),
                                        INTERVAL 1 YEAR
                                    ),
                                    '%Y-%m'
                                )
                            SET
                                curr.yoy_import_export = COALESCE (
                                    TRUNCATE((curr.monthly_total - prev.monthly_total) / NULLIF (prev.monthly_total, 0) * 100, 4),
                                    0.0000
                                ),
                                curr.yoy_import = COALESCE (
                                    TRUNCATE((curr.monthly_import - prev.monthly_import) / NULLIF (prev.monthly_import, 0) * 100, 4),
                                    0.0000
                                ),
                                curr.yoy_export = COALESCE (
                                    TRUNCATE((curr.monthly_export - prev.monthly_export) / NULLIF (prev.monthly_export, 0) * 100, 4),
                                    0.0000
                                )
                            WHERE
                                curr.prov_name = :prov_name
                              AND curr.crossborder_year_month >= '2024-01'
                              AND prev.monthly_total IS NOT NULL
                        """)

    with engine.begin() as conn:
        result = conn.execute(update_sql, {'prov_name': prov_name})
        print(f"{prov_name} 新数据更新数: {result.rowcount}")
        return result.rowcount
+
+
if __name__ == '__main__':
    # Manual run: refresh January YoY plus the full YoY pipeline for 浙江省.
    update_january_yoy('浙江省')
    update_shandong_yoy('浙江省')
    print("同比sql处理完成")

+ 33 - 0
utils/crawl_gov_commodity.py

@@ -0,0 +1,33 @@
+import pandas as pd
+from com.zf.crawl import base_mysql
+
def generate_sql_from_excel(excel_file):
    """Read the commodity-category worksheet and bulk-insert one row per line.

    Columns 0-3 hold the 4-level category hierarchy (the deepest non-empty
    cell of a row determines its level; a level-1 cell starts a new C0x
    group code); column 4 is the commodity code. Names carry over to
    following rows when their cells are empty (original behavior).
    """
    df = pd.read_excel(excel_file)

    sql_arr = []
    auto_increment = 0
    code = 0
    commodity_name = ''  # FIX: avoid NameError when the first row has no name cells
    for index, row in df.iterrows():
        commodity_code = str(row.values[4])
        level = 0
        for i in range(4):
            if not pd.isna(row.values[i]):
                if 0 == i:
                    # A new top-level category starts a fresh C0x group code.
                    auto_increment += 1
                    code = 'C0' + str(auto_increment)
                level = i + 1
                commodity_name = str(row.values[i]).rstrip('*')
                commodity_name = commodity_name.replace('(', '(').replace(')', ')')
        # FIX: double embedded quotes so the literal cannot break the statement.
        safe_name = commodity_name.replace("'", "''")
        sql = f"INSERT INTO t_yujin_crossborder_prov_commodity_category (code, level,commodity_code, commodity_name, create_time) VALUES ('{code}', {level}, '{commodity_code}', '{safe_name}', now());"
        sql_arr.append(sql)
        print(sql)

    print(f"√ 成功生成 commodity category SQL 文件 size {len(sql_arr)} ")
    # Batch-insert all generated statements.
    base_mysql.bulk_insert(sql_arr)
    print("trade SQL 存表完成!")
+
# One-off bootstrap: load the commodity category tree from a local Excel
# workbook into MySQL. NOTE(review): this runs at import time with a
# hard-coded Windows path — confirm it is only ever used as a manual script.
excel_file = 'C:\\Users\\admin\\Desktop\\海关总署数据梳理.xlsx'  # input Excel workbook
generate_sql_from_excel(excel_file)

+ 351 - 0
zhejiang/crawl_gov_zhejiangi_full.py

@@ -0,0 +1,351 @@
+import os
+import random
+import re
+import subprocess
+import time
+from pathlib import Path
+from urllib.parse import urljoin
+
+from faker import Faker
+from selenium import webdriver
+from selenium.common.exceptions import StaleElementReferenceException
+from selenium.webdriver import FirefoxOptions
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+import gov_commodity_zhejiang_city
+import gov_commodity_zhejiang_country
+import gov_commodity_zhejiang_import_export
+from utils import base_country_code, base_mysql
+
+download_dir = base_country_code.download_dir
+Path(download_dir).mkdir(parents=True, exist_ok=True)
+
def configure_stealth_options():
    """Build a hardened FirefoxOptions profile: silent downloads into
    download_dir, webdriver-flag removal, randomized user agent,
    headless 1440x900 viewport."""
    opts = FirefoxOptions()
    print("当前下载路径:", Path(download_dir).resolve())
    # Download handling: save straight into download_dir, no prompts.
    opts.set_preference("browser.download.dir", download_dir)
    opts.set_preference("browser.download.folderList", 2)
    opts.set_preference("browser.download.manager.showWhenStarting", False)
    opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
                        "application/octet-stream, application/vnd.ms-excel")  # cover common binary/Excel MIME types
    opts.set_preference("browser.download.manager.useWindow", False)  # no download-manager window
    opts.set_preference("browser.download.manager.showAlertOnComplete", False)  # no completion alert

    # Anti-detection: hide navigator.webdriver and automation hints.
    opts.set_preference("dom.webdriver.enabled", False)
    opts.set_preference("useAutomationExtension", False)
    opts.add_argument("--disable-blink-features=AutomationControlled")

    # Randomized fingerprint: fresh Firefox UA per run, zh-CN locale.
    fake = Faker()
    opts.set_preference("general.useragent.override", fake.firefox())
    opts.set_preference("intl.accept_languages", "zh-CN,zh;q=0.9")

    # Viewport size + headless mode.
    opts.add_argument("--width=1440")
    opts.add_argument("--height=900")
    opts.add_argument("--headless")
    return opts
+
def crawl_by_year_tabs(driver, base_url):
    """Iterate the year tabs (2023年-2025年) on the stats page; each year
    opens in a new browser tab, is processed month by month, then closed."""
    years = ['2023年', '2024年', '2025年']
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
    )

    year_tabs = driver.find_elements(By.XPATH, '//ul[@class="nav_sj"]//li//a')
    for tab in year_tabs:
        year_text = tab.text.strip()
        if year_text not in years:
            continue

        year_url = tab.get_attribute("href")
        # Resolve relative hrefs against the scheme+host of base_url.
        if not year_url.startswith(('http://', 'https://')):
            year_url = base_url.split('//')[0] + '//' + base_url.split('/')[2] + year_url

        # Open the year page in a fresh tab and switch to it.
        driver.execute_script("window.open(arguments[0]);", year_url)
        driver.switch_to.window(driver.window_handles[-1])
        print(f"\n正在处理 {year_text} 年份页面")

        process_month_tabs(driver, year_text, base_url)

        # Close the year tab and return to the original window.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
+
def process_month_tabs(driver, year, base_url):
    """Walk the month tabs for *year*: click each Chinese month label,
    collect the article links behind it and download their attachments.

    FIX: the loop was `while True` with the retry-counter increments
    commented out, so a permanently missing tab spun forever; the intended
    3-attempt cap (visible in the commented-out code) is restored.
    """
    # Wait for the tab container to render.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
    )

    target_months = ['一月', '二月', '三月', '四月', '五月', '六月',
                     '七月', '八月', '九月', '十月', '十一月', '十二月']
    processed_months = set()  # months already handled this run
    retry_count = 0

    while retry_count < 3:  # bounded retries instead of an unbounded loop
        try:
            # Re-fetch the full tab list every pass (DOM may have refreshed).
            month_items = driver.find_elements(By.XPATH, '//ul[@class="nav_tab"]//li')
            if not month_items:
                print(f"{year}年没有月份Tab,停止处理")
                break

            all_found = True
            month_text = ''
            found = False
            for i,item in enumerate(month_items):
                a_tag = item.find_element(By.XPATH, './/a')
                month_text = a_tag.text.strip()

                if month_text in processed_months:
                    continue

                if not month_text in target_months:
                    continue  # not a recognized month label

                print(f"点击月份Tab:{year}-{month_text}")
                a_tag.click()

                # Wait for the month's article list, then harvest links.
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
                )
                detail_link_arr = get_behind_detail_link(driver, base_url)
                if not detail_link_arr:
                    print(f"{year}-{month_text} 未找到详情链接")
                for detail_link in detail_link_arr:
                    print(f"{year}-{month_text} 详情链接:{detail_link}")
                    driver.get(detail_link)
                    download_file_from_detail_page(driver)
                    driver.back()
                    WebDriverWait(driver, 30).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
                    )

                processed_months.add(month_text)
                found = True

            if not found:
                print(f"{year}年未找到 {month_text} Tab")
                all_found = False

            if all_found:
                print(f"{year}年所有目标月份处理完成")
                break
            else:
                # Some months missing: count the retry and re-scan.
                retry_count += 1
                print(f"第 {retry_count} 次重试获取月份Tab...")
                time.sleep(2)

        except StaleElementReferenceException:
            print("页面刷新,重新获取月份Tab列表...")
            retry_count += 1
            time.sleep(2)

    print(f"{year}年最终处理的月份:{processed_months}")
+
def get_behind_detail_link(driver, base_url):
   """Collect absolute detail-page URLs from conList_ul after a month tab
   was clicked; returns [] on any failure."""
   href_arr = []
   try:
       # Wait for the first link to become clickable, then re-query them all.
       elements = WebDriverWait(driver, 30).until(
           EC.element_to_be_clickable((By.XPATH, '//ul[@class="conList_ul"]/li/a'))
       )
       elements = elements.find_elements(By.XPATH, '//ul[@class="conList_ul"]/li/a')
       for element in elements:
           href = element.get_attribute("href")
           full_url = urljoin(base_url, href)  # resolves relative paths
           href_arr.append(full_url)
       return href_arr
   except Exception as e:
       print(f"获取详情链接失败: {str(e)}")
       return []
+
def download_file_from_detail_page(driver):
    """Download every Excel link on an article detail page and move each
    finished file into download_dir/<year>/<month>/<original name>."""
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "portlet"))
    )

    try:
        elements = driver.find_elements(By.XPATH, '//div[@class="easysite-news-content"]//div[@id="easysiteText"]//p//a')
        if not elements:
            print("详情页未找到目标文件链接")
            return

        for download_btn in elements:
            file_name = download_btn.text.strip()
            if not file_name:
                continue
            file_url = download_btn.get_attribute("href")

            # Only .xls/.xlsx attachments are of interest.
            if not file_url.lower().endswith(('.xls', '.xlsx')):
                print(f"跳过非 Excel 文件: {file_url}")
                continue

            print(f"正在下载: {file_name} → {file_url}")

            # Snapshot the directory so the newly-arrived file can be found.
            existing_files = set(f.name for f in Path(download_dir).glob('*'))
            # Random delay before clicking, to look less bot-like.
            time.sleep(random.uniform(1, 3))
            download_btn.click()

            downloaded_file = wait_for_download_complete(existing_files=existing_files)

            # NOTE(review): a "1-2月" range file is filed under the *end*
            # month — confirm downstream folder parsing expects that.
            year, start_month, month = extract_year_and_month(file_name)
            final_path = Path(download_dir) / year / month / f"{file_name}"
            if os.path.exists(final_path):
                print(f"文件已存在:{file_name} 正在覆盖...")
                os.unlink(final_path)

            final_dir = Path(download_dir) / year / month
            final_dir.mkdir(parents=True, exist_ok=True)
            print(f"√ 正在移动文件 {downloaded_file} 至 {final_path}")
            downloaded_file.rename(final_path)
            print(f"√ 下载成功:{final_path}")

    except Exception as e:
        print(f"详情页处理异常: {str(e)}")
+
def extract_year_and_month(file_name):
    """Pull (year, start_month, end_month) out of a customs file name.

    Accepts both "2025年1-2月..." (a range) and "2025年3月..." (a single
    month, where start == end); months come back zero-padded to 2 digits.

    :raises ValueError: when no year/month pattern is present.
    """
    matched = re.search(r"(\d{4})年(\d{1,2})(?:-(\d{1,2}))?月", file_name)
    if not matched:
        raise ValueError(f"无法从文件名中提取年份和月份:{file_name}")

    year = matched.group(1)
    first = matched.group(2)
    last = matched.group(3) if matched.group(3) else first
    return year, first.zfill(2), last.zfill(2)
+
def extract_rar(rar_path, extract_to):
    """Extract *rar_path* into *extract_to* via the WinRAR CLI (fallback
    for when the rarfile module fails). Windows-only: Rar.exe path is
    hard-coded. Returns True on success."""
    winrar_path = r"C:\Program Files\WinRAR\Rar.exe"  # Rar.exe preferred over WinRAR.exe (no GUI)
    cmd = [winrar_path, 'x', '-y', rar_path, str(extract_to)]

    # CREATE_NO_WINDOW keeps a console window from flashing up on Windows.
    creationflags = subprocess.CREATE_NO_WINDOW if os.name == 'nt' else 0

    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        creationflags=creationflags  # key point: hide the window
    )

    if result.returncode == 0:
        print(f"解压成功: {rar_path} → {extract_to}")
        return True
    else:
        # WinRAR emits localized (GBK) output on Chinese Windows.
        print(f"解压失败: {result.stderr.decode('gbk')}")
        return False
+
+
def crawl_with_selenium(url):
    """Launch a stealth-configured Firefox, open *url* on the Hangzhou
    customs site and crawl every year tab; the driver is always quit."""
    driver = webdriver.Firefox(options=configure_stealth_options())
    base_url = 'http://hangzhou.customs.gov.cn'

    try:
        # Extra anti-detection: blank out navigator.webdriver and alert().
        driver.execute_script("""
            Object.defineProperty(navigator, 'webdriver', { 
                get: () => undefined 
            });
            window.alert = () => {};
        """)

        # Load the landing page.
        driver.get(url)

        # Walk the year tabs (2023-2025).
        crawl_by_year_tabs(driver, base_url)

    finally:
        driver.quit()
+
def wait_for_download_complete(timeout=30, existing_files=None):
    """
    Poll the download directory until a new, fully-written file appears.

    :param timeout: seconds to wait before giving up
    :param existing_files: file names present before the download started
        (snapshotted here if not supplied)
    :return: Path of the newest completed download
    :raises TimeoutError: when nothing lands within *timeout* seconds
    """
    deadline = time.time() + timeout
    partial_suffixes = ('.part', '.crdownload')

    if existing_files is None:
        existing_files = {f.name for f in Path(download_dir).glob('*')}

    while time.time() < deadline:
        # New, non-partial, non-empty files only.
        candidates = [
            f for f in Path(download_dir).glob('*')
            if f.name not in existing_files
            and not f.name.endswith(partial_suffixes)
            and f.stat().st_size > 0
        ]
        if candidates:
            # Newest modification time wins when several files arrived.
            return max(candidates, key=lambda p: p.stat().st_mtime)
        time.sleep(2)
    raise TimeoutError("文件下载超时")
+
def hierarchical_traversal(root_path):
    """Layered walk of *root_path*: year directories (YYYY), then month
    directories (01-12), newest first; runs the three Zhejiang processors
    (import/export, country, city) on each month folder."""
    root = Path(root_path)
    # Year directories only.
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    # Newest year first.
    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # Full path shape: download/<prov>/2025/03
        # FIX: this module crawls Zhejiang — the log label previously
        # said "jiangsu" (copy-paste from the Jiangsu crawler).
        print(f"\n年份:{year_dir.name} | 省份:zhejiang")

        # Collect month sub-directories.
        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({
                    "path": item,
                    "month": int(item.name)
                })
        # Newest month first.
        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                gov_commodity_zhejiang_import_export.process_folder(md['path'])
                gov_commodity_zhejiang_country.process_folder(md['path'])
                gov_commodity_zhejiang_city.process_folder(md['path'])
+
if __name__ == "__main__":
    # 1) Crawl and download all monthly workbooks from Hangzhou customs.
    crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html')
    print(f"浙江杭州海关全量数据下载任务完成")
    # 2) Short pause, then parse every downloaded year/month folder.
    time.sleep(5)
    hierarchical_traversal(base_country_code.download_dir)
    print("浙江杭州海关类章、国家、城市所有文件处理完成!")
    time.sleep(5)
    # 3) Refresh the year-over-year columns for 浙江省.
    base_mysql.update_january_yoy('浙江省')
    base_mysql.update_shandong_yoy('浙江省')
    print("浙江杭州海关城市同比sql处理完成")

+ 173 - 0
zhejiang/gov_commodity_zhejiang_city.py

@@ -0,0 +1,173 @@
import time
from pathlib import Path

import pandas as pd

from utils import base_country_code, base_mysql
from utils.base_country_code import format_sql_value

# Zhejiang prefecture name -> 6-digit region code, keyed exactly as the names
# appear in the "十一地市" (eleven prefectures) customs sheet.
city_code_map = {
    "杭州地区": "330100",
    "宁波地区": "330200",
    "温州地区": "330300",
    "绍兴地区": "330400",
    "湖州地区": "330500",
    "嘉兴地区": "330600",
    "金华地区": "330700",
    "衢州地区": "330800",
    "舟山地区": "330900",
    "台州地区": "331000",
    "丽水地区": "331100"
}
+
def get_df(path, year_month):
    """Load the "十一地市" (eleven prefectures) sheet from one month directory.

    Looks for the sheet inside the first file of *path*; for the 2023-01..11
    publications the data lives in a dedicated per-topic file instead, so the
    directory is also scanned for a file whose name contains the keyword.

    :param path: month directory, e.g. download/2024/07
    :param year_month: 'YYYY-MM' of the month that triggered this load; the
        2024-07 publication changed units, so its values are divided by 10000.
    :return: (import_df, export_df, total_df), each with columns
        ['commodity', <value>], or None when no matching sheet/file exists.
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None
    file_path = file_paths[0]
    sheet_name = base_country_code.find_sheet_by_keyword(file_path, "十一地市")

    flag = True
    if not sheet_name:
        print(f"{file_path} 未找到包含 十一地市 sheet")
        # 2023 Jan-Nov: data is split across several files; pick the one by name
        for file_path in file_paths:
            if '十一地市' in file_path.name:
                flag = False
                break

    if not sheet_name and flag:
        print(f"{path} 未找到包含 十一地市 sheet或文件")
        return None
    if flag:
        xls = pd.ExcelFile(file_path)
        df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
    else:
        df = pd.read_excel(file_path, header=None)

    # 2024-07 values are published in a different unit; rescale to match.
    # Fixed: the original guard started with `if temp_df['total'] and ...`,
    # which raises "The truth value of a Series is ambiguous" whenever reached.
    rescale = year_month == '2024-07'

    frames = {}
    # Column layout: 1 = prefecture name, 2 = total, 3 = import, 4 = export
    for col, key in ((2, 'total'), (3, 'import'), (4, 'export')):
        temp_df = df[[1, col]].rename(columns={1: 'commodity', col: key})
        # '--' marks a missing value in the source sheets
        temp_df[key] = pd.to_numeric(temp_df[key].replace('--', 0), errors='coerce').astype(float)
        if rescale:
            temp_df[key] = temp_df[key] / 10000
        frames[key] = temp_df

    return frames['import'], frames['export'], frames['total']
+
def process_folder(path):
    """Build and insert prov_region_trade rows for one month directory.

    The source sheets are cumulative (year-to-date), so for months other than
    January the previous month's cumulative figures are subtracted to obtain
    monthly values.

    :param path: month directory, e.g. download/2024/07
    """
    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    sql_arr = []
    res = get_df(path, None)
    if res is None:
        print(f"{year_month} prov_region_trade 未找到包含 地市 sheet")
        return
    import_df, export_df, total_df = res
    # Group and sum current-month data per prefecture
    curr_import = import_df.groupby('commodity')['import'].sum().reset_index()
    curr_export = export_df.groupby('commodity')['export'].sum().reset_index()
    total_df = total_df.groupby('commodity')['total'].sum().reset_index()

    if not month == 1:
        previous_month_dir = base_country_code.get_previous_month_dir(path)
        res = get_df(previous_month_dir, year_month)
        if res is None:
            print(f"{path} 上月目录里文件未找到包含 地市 sheet")
            return
        prev_import_df, prev_export_df, prev_total_df = res

        # Group previous-month (cumulative) data
        prev_import = prev_import_df.groupby('commodity')['import'].sum().reset_index()
        prev_export = prev_export_df.groupby('commodity')['export'].sum().reset_index()
        prev_total_df = prev_total_df.groupby('commodity')['total'].sum().reset_index()

        # Monthly value = current cumulative - previous cumulative
        curr_import = pd.merge(curr_import, prev_import, on='commodity', how='left')
        curr_import['import'] = round(curr_import['import_x'] - curr_import['import_y'], 4)

        curr_export = pd.merge(curr_export, prev_export, on='commodity', how='left')
        curr_export['export'] = round(curr_export['export_x'] - curr_export['export_y'], 4)

        total_df = pd.merge(total_df, prev_total_df, on='commodity', how='left')
        total_df['total'] = round(total_df['total_x'] - total_df['total_y'], 4)
        print(f"合并文件: {path}*********{previous_month_dir}")

    # Merge import/export/total per prefecture
    merged_df = pd.merge(curr_import, curr_export, on='commodity', how='outer')
    merged_df = pd.merge(merged_df, total_df, on='commodity', how='outer')

    for _, row in merged_df.iterrows():
        city_name = str(row['commodity']).strip()
        city_code = city_code_map.get(city_name)
        if not city_code:
            print(f"未找到省 '{city_name}' 对应市编码")
            continue

        # Unit handling: from 2024-07 onward the source is presumably already
        # in the target unit; earlier months are scaled by 1/10000 — TODO confirm
        if year == 2025 or (year == 2024 and month in [7, 8, 9, 10, 11, 12]):
            monthly_import = round(row['import'], 4)
            monthly_export = round(row['export'], 4)
            monthly_total = round(row['total'], 4)
        else:
            monthly_import = round(row['import'] / 10000, 4)
            monthly_export = round(row['export'] / 10000, 4)
            monthly_total = round(row['total'] / 10000, 4)
        yoy_import_export, yoy_import, yoy_export = 0, 0, 0

        # Build the INSERT statement (YoY columns are back-filled later)
        sql = (f"INSERT INTO t_yujin_crossborder_prov_region_trade "
               f"(crossborder_year, crossborder_year_month, prov_code, prov_name, city_code, city_name, monthly_total, monthly_export, monthly_import,yoy_import_export, yoy_import, yoy_export, create_time) VALUES "
               f"('{year}', '{year_month}', '330000', '浙江省', '{city_code}', '{city_name}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', '{yoy_export}', now());\n")
        sql_arr.append(sql)

    print(f"√ {year_month} prov_region_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    # Bulk-insert the generated statements
    base_mysql.bulk_insert(sql_arr)
    print(f"√ {year_month} prov_region_trade SQL 存表完成!")
+
def hierarchical_traversal(root_path):
    """Walk <root>/<year>/<month> and run process_folder on each month, newest first.

    :param root_path: download root containing 4-digit year directories
    """
    root = Path(root_path)
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # Fixed: the log label said "jiangsu" although this module handles Zhejiang
        print(f"\n年份:{year_dir.name} | 省份:zhejiang")

        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({"path": item, "month": int(item.name)})

        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                process_folder(md['path'])
+
if __name__ == '__main__':
    # Process every downloaded month, then back-fill year-over-year columns.
    hierarchical_traversal(base_country_code.download_dir)
    print(f"浙江杭州海关城市所有文件处理完成!")
    time.sleep(5)
    base_mysql.update_january_yoy('浙江省')
    base_mysql.update_shandong_yoy('浙江省')
    print("同比sql处理完成")
    # root = Path(base_country_code.download_dir)/'2024'/'07'
    # process_folder(root)

+ 179 - 0
zhejiang/gov_commodity_zhejiang_country.py

@@ -0,0 +1,179 @@
from pathlib import Path

import pandas as pd

from utils import base_country_code, base_mysql
from utils.base_country_code import format_sql_value

# Aggregate rows to skip: continents and trade blocs, which would double-count
# the per-country figures.
EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
                   "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",
                   "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区"]
+
def get_df_country(path, year_month):
    """Read the country/organisation sheet from one month directory.

    Searches the first file for a "国别" sheet, then a "组织" sheet; the
    2023-01..11 publications instead ship a dedicated "洲贸组织" file, which
    is searched for as a fallback.

    :param path: month directory, e.g. download/2024/07
    :param year_month: 'YYYY-MM' of the month that triggered this load; the
        2024-07 publication changed units, so its values are divided by 10000.
    :return: (import_df, export_df, total_df), each with columns
        ['commodity', <value>], or None when nothing matches.
    """
    files = list(Path(path).glob('*'))
    if not files:
        print("未找到任何文件")
        return None

    target = files[0]
    print(f"处理文件: {target.name}")

    workbook = pd.ExcelFile(target)

    single_file = True
    sheet_name = base_country_code.find_sheet_by_keyword(target, "国别")
    if not sheet_name:
        print(f"{target} 未找到包含 国别 sheet")
        sheet_name = base_country_code.find_sheet_by_keyword(target, "组织")
        if not sheet_name:
            print(f"{target} 未找到包含 组织 sheet")
            # 2023 Jan-Nov: data is split across several files; pick by name
            for candidate in files:
                if '洲贸组织' in candidate.name:
                    target = candidate
                    single_file = False
                    break
    if not sheet_name and single_file:
        print(f"{path} 未找到包含 国别 | 组织 | 洲贸组织 sheet或文件")
        return None

    if single_file:
        df = pd.read_excel(workbook, sheet_name=sheet_name, header=None)
    else:
        df = pd.read_excel(target, header=None)

    # 2024-07 values are published in a different unit; rescale to match.
    rescale = year_month == '2024-07'

    frames = {}
    # Column layout: 0 = country name, 1 = total, 2 = import, 3 = export
    for col, key in ((1, 'total'), (2, 'import'), (3, 'export')):
        part = df[[0, col]].rename(columns={0: 'commodity', col: key})
        # '--' marks a missing value in the source sheets
        part[key] = pd.to_numeric(part[key].replace('--', 0), errors='coerce').astype(float)
        if rescale:
            part[key] = part[key] / 10000
        frames[key] = part

    return frames['import'], frames['export'], frames['total']
+
def process_folder(path):
    """Build and insert prov_country_trade rows for one month directory.

    The source sheets are cumulative (year-to-date), so for months other than
    January the previous month's cumulative figures are subtracted to obtain
    monthly values.

    :param path: month directory, e.g. download/2024/07
    """
    res = get_df_country(path, None)
    if not res:
        print(f"{path} 目录里文件未找到包含 国别 sheet")
        return
    import_df, export_df, total_df = res

    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    # Group and sum current-month data per country
    curr_import = import_df.groupby('commodity')['import'].sum().reset_index()
    curr_export = export_df.groupby('commodity')['export'].sum().reset_index()
    total_df = total_df.groupby('commodity')['total'].sum().reset_index()

    if not month == 1:
        previous_month_dir = base_country_code.get_previous_month_dir(path)
        res = get_df_country(previous_month_dir, year_month)
        if not res:
            print(f"{path} 上月目录里文件未找到包含 国别 sheet")
            return
        prev_import_df, prev_export_df, prev_total_df = res

        # Group previous-month (cumulative) data
        prev_import = prev_import_df.groupby('commodity')['import'].sum().reset_index()
        prev_export = prev_export_df.groupby('commodity')['export'].sum().reset_index()
        prev_total_df = prev_total_df.groupby('commodity')['total'].sum().reset_index()

        # Monthly value = current cumulative - previous cumulative
        curr_import = pd.merge(curr_import, prev_import, on='commodity', how='left')
        curr_import['import'] = round(curr_import['import_x'] - curr_import['import_y'], 4)

        curr_export = pd.merge(curr_export, prev_export, on='commodity', how='left')
        curr_export['export'] = round(curr_export['export_x'] - curr_export['export_y'], 4)

        total_df = pd.merge(total_df, prev_total_df, on='commodity', how='left')
        total_df['total'] = round(total_df['total_x'] - total_df['total_y'], 4)
        print(f"合并文件: {path}*********{previous_month_dir}")

    # Merge import/export/total per country
    merged_df = pd.merge(curr_import, curr_export, on='commodity', how='outer')
    merged_df = pd.merge(merged_df, total_df, on='commodity', how='outer')

    sql_arr = []
    for _, row in merged_df.iterrows():
        country_name = str(row['commodity']).strip()
        # Strip a trailing parenthesised alias, handling both full-width and
        # ASCII parentheses. Fixed: the original
        # `rsplit("(")[0] or rsplit("(")[0]` never reached the ASCII branch,
        # because a non-empty first segment is always truthy.
        if country_name.endswith(")"):
            country_name = country_name.rsplit("(", 1)[0]
        elif country_name.endswith(")"):
            country_name = country_name.rsplit("(", 1)[0]
        # Skip continent/trade-bloc aggregate rows
        if country_name in EXCLUDE_REGIONS:
            continue
        # Map the country name to its code
        country_code = base_country_code.COUNTRY_CODE_MAPPING.get(country_name)
        if not country_code:
            print(f"{year_month} 未找到国家 '{country_name}' 对应的编码")
            continue

        # Unit handling: from 2024-07 onward the source is already in the
        # target unit; earlier months are scaled by 1/10000
        if year == 2025 or (year == 2024 and month in [7, 8, 9, 10, 11, 12]):
            monthly_import = round(row['import'], 4)
            monthly_export = round(row['export'], 4)
            monthly_total = round(row['total'], 4)
        else:
            monthly_import = round(row['import'] / 10000, 4)
            monthly_export = round(row['export'] / 10000, 4)
            monthly_total = round(row['total'] / 10000, 4)
        yoy_import_export, yoy_import, yoy_export = 0, 0, 0
        # Build the INSERT statement (YoY columns are back-filled later)
        sql = (
            f"INSERT INTO t_yujin_crossborder_prov_country_trade "
            f"(crossborder_year, crossborder_year_month, prov_code, prov_name, country_code, country_name, "
            f"monthly_total, monthly_export, monthly_import, yoy_import_export, yoy_import, yoy_export, create_time) "
            f"VALUES ('{year}', '{year_month}', '330000', '浙江省', '{country_code}', '{country_name}', "
            f"{format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, '{yoy_import_export}', '{yoy_import}', "
            f"'{yoy_export}', NOW());"
        )
        sql_arr.append(sql)

    print(f"√ {year_month} 成功生成 SQL 条数: {len(sql_arr)}")
    # Bulk-insert the generated statements
    base_mysql.bulk_insert(sql_arr)
    print(f"√ {year_month} prov_country_trade SQL 存表完成!\n")
+
def hierarchical_traversal(root_path):
    """Walk <root>/<year>/<month> and run process_folder on each month, newest first.

    :param root_path: download root containing 4-digit year directories
    """
    root = Path(root_path)
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # Fixed: the log label said "jiangsu" although this module handles Zhejiang
        print(f"\n年份:{year_dir.name} | 省份:zhejiang")

        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({"path": item, "month": int(item.name)})

        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                process_folder(md['path'])
+
+
if __name__ == '__main__':
    # hierarchical_traversal(base_country_code.download_dir)

    # NOTE(review): only 2024/07 is processed here while the full traversal is
    # commented out — presumably a debugging leftover; confirm before release.
    root = Path(base_country_code.download_dir) / '2024' / '07'
    process_folder(root)
    print("浙江杭州海关国别所有文件处理完成!")

+ 231 - 0
zhejiang/gov_commodity_zhejiang_import_export.py

@@ -0,0 +1,231 @@
from pathlib import Path

import re
import pandas as pd

from utils import base_country_code, base_mysql
from utils.base_country_code import format_sql_value

# Source commodity names rewritten to the canonical names expected by the
# commodity reference table.
CUSTOM_COMMODITY_REPLACEMENTS = {
    '稻谷及大米': '稻谷、大米及大米粉',
    '有机发光二极管平板显示模组': '有机发光二极管(OLED)平板显示模组',
}

# Names whose parenthesised part is significant and must NOT be stripped
# during cleaning.
PRESERVE_PARENTHESES_KEYWORDS = {
    '汽车(包括底盘)',
}
+
+def clean_commodity_name(name, preserve_keywords=None):
+    """
+    自定义清洗商品名称逻辑,支持条件保留中文括号内容
+
+    :param name: 商品名称字符串
+    :param preserve_keywords: 需要保留括号的关键词集合
+    :return: 清洗后的商品名称
+    """
+    name = str(name).strip().replace('(', '(').replace(')', ')')
+
+    # 去除非必要符号
+    name = re.sub(r'[#*?]', '', name)
+    name = re.sub(r'_x000D_', '', name)
+
+    # 判断是否需要保留括号内容
+    if preserve_keywords:
+        for keyword in preserve_keywords:
+            if keyword == name:
+                # 匹配到关键词时,不移除括号内容
+                return name
+
+    # 默认移除中文括号及内容
+    name = re.sub(r'([^)]*)', '', name)
+    return name.strip()
+
def get_df_import_export(path, year_month):
    """Load the "主出商品"/"主进商品" (top export/import commodity) sheets.

    Searches the first file of *path* for each sheet; the 2023-01..11
    publications instead ship dedicated "主要出口商品"/"主要进口商品" files,
    which are searched for as fallbacks (note: the fallback files have one
    fewer header row, hence iloc[1:] vs iloc[2:]).

    :param path: month directory, e.g. download/2024/07
    :param year_month: 'YYYY-MM' of the month that triggered this load; the
        2024-07 publication changed units, so its values are divided by 10000.
    :return: (import_df, export_df) with columns ['commodity', <value>],
        or None when a required sheet/file is missing.
    """
    file_paths = list(Path(path).glob('*'))
    if not file_paths:
        print("未找到任何文件")
        return None
    file_path = file_paths[0]
    print(f"处理文件: {file_path.name}")

    xls = pd.ExcelFile(file_path)
    import_df = pd.DataFrame()
    export_df = pd.DataFrame()

    flag = True
    sheet_name = base_country_code.find_sheet_by_keyword(file_path, "主出商品")
    if not sheet_name:
        print(f"{file_path} 单文件未找到包含 主出商品 sheet")
        # 2023 Jan-Nov: data is split across several files; pick by name
        for file_path in file_paths:
            if '主要出口商品' in file_path.name:
                file_path = file_path
                flag = False
                break
    if not sheet_name and flag:
        print(f"{path} 中未找到 主出商品 sheet或文件")
        return None

    if flag:
        df = pd.read_excel(xls, sheet_name=sheet_name, header=None).iloc[2:]
    else:
        df = pd.read_excel(file_path, header=None).iloc[1:]
    # Column layout: 0 = commodity name, 1 = export value
    temp_df = df[[0, 1]].rename(columns={0: 'commodity', 1: 'export'})
    temp_df['commodity'] = (
        temp_df['commodity']
        .astype(str)
        .apply(lambda x: clean_commodity_name(x, preserve_keywords=PRESERVE_PARENTHESES_KEYWORDS))
        .replace(CUSTOM_COMMODITY_REPLACEMENTS, regex=False)
    )
    # '--' marks a missing value in the source sheets
    temp_df['export'] = pd.to_numeric(temp_df['export'].replace('--', 0), errors='coerce').astype(float)
    if year_month and year_month == '2024-07':
        # 2024-07 values are published in a different unit; rescale to match
        temp_df['export'] = temp_df['export'] / 10000
    export_df = pd.concat([export_df, temp_df])

    flag_2 = True
    sheet_name = base_country_code.find_sheet_by_keyword(file_path, "主进商品")
    if not sheet_name:
        print(f"{file_path} 单文件未找到包含 主进商品 sheet")
        # 2023 Jan-Nov: data is split across several files; pick by name
        for file_path in file_paths:
            if '主要进口商品' in file_path.name:
                file_path = file_path
                flag_2 = False
                break
    if not sheet_name and flag_2:
        print(f"{path} 中未找到 主进商品 sheet或文件")
        return None

    if flag_2:
        df = pd.read_excel(xls, sheet_name=sheet_name, header=None).iloc[2:]
    else:
        df = pd.read_excel(file_path, header=None).iloc[1:]
    # Column layout: 0 = commodity name, 1 = import value
    temp_df = df[[0, 1]].rename(columns={0: 'commodity', 1: 'import'})
    temp_df['commodity'] = (
        temp_df['commodity']
        .astype(str)
        .apply(lambda x: clean_commodity_name(x, preserve_keywords=PRESERVE_PARENTHESES_KEYWORDS))
        .replace(CUSTOM_COMMODITY_REPLACEMENTS, regex=False)
    )
    temp_df['import'] = pd.to_numeric(temp_df['import'].replace('--', 0), errors='coerce').astype(float)
    if year_month and year_month == '2024-07':
        temp_df['import'] = temp_df['import'] / 10000
    import_df = pd.concat([import_df, temp_df])

    return import_df, export_df
+
+
def process_folder(path):
    """Compute monthly commodity import/export values for one month directory.

    The source sheets are cumulative (year-to-date), so for months other than
    January the previous month's cumulative figures are subtracted before
    handing the merged frame to save_to_database.

    :param path: month directory, e.g. download/2024/07
    """
    res = get_df_import_export(path, None)
    if not res:
        print(f"{path} 目录里文件未找到包含 主出、主进商品 sheet")
        return
    import_df, export_df = res

    year, month = base_country_code.extract_year_month_from_path(path)
    year_month = f'{year}-{month:02d}'

    # Group and sum current-month data per commodity
    curr_import = import_df.groupby('commodity')['import'].sum().reset_index()
    curr_export = export_df.groupby('commodity')['export'].sum().reset_index()

    if not month == 1:
        previous_month_dir = base_country_code.get_previous_month_dir(path)
        res = get_df_import_export(previous_month_dir, year_month)
        if not res:
            print(f"{path} 上月目录里文件未找到包含 主出、主进商品 sheet")
            return
        prev_import_df, prev_export_df = res

        # Group previous-month (cumulative) data
        prev_import = prev_import_df.groupby('commodity')['import'].sum().reset_index()
        prev_export = prev_export_df.groupby('commodity')['export'].sum().reset_index()

        # Monthly value = current cumulative - previous cumulative
        curr_import = pd.merge(curr_import, prev_import, on='commodity', how='left')
        curr_import['import'] = round(curr_import['import_x'] - curr_import['import_y'], 4)

        curr_export = pd.merge(curr_export, prev_export, on='commodity', how='left')
        curr_export['export'] = round(curr_export['export_x'] - curr_export['export_y'], 4)
        print(f"合并文件: {path}*********{previous_month_dir}")

    # Merge import and export per commodity and persist
    merged_df = pd.merge(curr_import, curr_export, on='commodity', how='outer')
    save_to_database(merged_df, year, month)
+
def save_to_database(merged_df, year, month):
    """Insert prov_commodity_trade rows built from *merged_df*.

    :param merged_df: DataFrame with 'commodity', 'import' and 'export'
        columns holding monthly values (unit depends on publication period)
    :param year: four-digit year of the data
    :param month: month number (1-12)
    """
    year_month = f'{year}-{month:02d}'
    processed_commodities = set()
    sql_arr = []
    try:
        for _, row in merged_df.iterrows():
            commodity_name = str(row['commodity']).strip()
            # Aggregate categories that have no entry in the commodity table
            if commodity_name == '消费品' or commodity_name == '劳动密集型产品':
                print(f'{commodity_name} 商品不存在,跳过')
                continue
            commodity_code, commodity_name_fix = base_mysql.get_commodity_id(commodity_name)
            if not commodity_code:
                print(f"未找到商品名称 '{commodity_name}' 对应的 ID")
                continue
            # Several raw names can normalise to the same canonical name;
            # only the first occurrence is kept
            if not commodity_name_fix or commodity_name_fix in processed_commodities:
                print(f"已处理过 '{commodity_name_fix}',传入name:{commodity_name}")
                continue

            # Unit handling: from 2024-07 onward the source is already in the
            # target unit; earlier months are scaled by 1/10000
            if year == 2025 or (year == 2024 and month in [7, 8, 9, 10, 11, 12]):
                monthly_import = round(row['import'], 4)
                monthly_export = round(row['export'], 4)
                monthly_total = round(monthly_import + monthly_export, 4)
            else:
                monthly_import = round(row['import'] / 10000, 4)
                monthly_export = round(row['export'] / 10000, 4)
                # Fixed: the total was divided by 10000 a second time even
                # though monthly_import/monthly_export are already scaled.
                monthly_total = round(monthly_import + monthly_export, 4)

            sql = (f"INSERT INTO t_yujin_crossborder_prov_commodity_trade "
                   f"(crossborder_year, crossborder_year_month, prov_code, prov_name, commodity_code, commodity_name, monthly_total, monthly_export, monthly_import, create_time) VALUES "
                   f"('{year}', '{year_month}', '330000', '浙江省', '{commodity_code}', '{commodity_name_fix}', {format_sql_value(monthly_total)}, {format_sql_value(monthly_export)}, {format_sql_value(monthly_import)}, now());")
            sql_arr.append(sql)

            processed_commodities.add(commodity_name_fix)

    except Exception as e:
        print(f"{year_month} prov_commodity_trade 生成 SQL 文件时发生异常: {str(e)}")

    print(f"√ {year_month} prov_commodity_trade 成功生成 SQL 文件 size {len(sql_arr)} ")
    # Bulk-insert the generated statements
    base_mysql.bulk_insert(sql_arr)
    print(f"√ {year_month} prov_commodity_trade SQL 存表完成!\n")
+
def hierarchical_traversal(root_path):
    """Walk <root>/<year>/<month> and run process_folder on each month, newest first.

    :param root_path: download root containing 4-digit year directories
    """
    root = Path(root_path)
    # Year-level directories, e.g. download/2025
    year_dirs = [
        item for item in root.iterdir()
        if item.is_dir() and base_country_code.YEAR_PATTERN.match(item.name)
    ]

    # Newest year first
    for year_dir in sorted(year_dirs, key=lambda x: x.name, reverse=True):
        # Fixed: the log label said "jiangsu" although this module handles Zhejiang
        print(f"\n年份:{year_dir.name} | 省份:zhejiang")

        # Collect month-level directories
        month_dirs = []
        for item in year_dir.iterdir():
            if item.is_dir() and base_country_code.MONTH_PATTERN.match(item.name):
                month_dirs.append({
                    "path": item,
                    "month": int(item.name)
                })
        # Newest month first
        if month_dirs:
            for md in sorted(month_dirs, key=lambda x: x["month"], reverse=True):
                print(f"  月份:{md['month']:02d} | 路径:{md['path']}")
                process_folder(md['path'])
+
if __name__ == '__main__':
    # Process every downloaded month directory under the download root.
    hierarchical_traversal(base_country_code.download_dir)

    # root = Path(base_country_code.download_dir)/'2023'/'01'
    # process_folder(root)
    print("浙江杭州海关类章所有文件处理完成!")