Bladeren bron

海关总署增加收发件人数据清洗入库兼容地级市数据逻辑

01495251 1 maand geleden
bovenliggende
commit
abbc49f493

+ 4 - 2
crossborder/cli.py

@@ -2,6 +2,8 @@ import argparse
 from importlib import import_module
 import sys
 
+from crossborder.utils.log import log
+
 PROVINCE_MODULES = {
     "shandong": "crossborder.shandong.selenium_shandong_download",
     "guangdong": "crossborder.guangdong.selenium_guangdong_download",
@@ -18,7 +20,7 @@ PROVINCE_MODULES = {
 def run_province(name, year=None):
     try:
         module = import_module(PROVINCE_MODULES[name])
-        print(f"✅ 正在运行 {name} 模块...")
+        log.info(f"✅ 正在运行 {name} 模块...")
 
         # 构造模拟的 sys.argv
         fake_argv = ['script_name']
@@ -29,7 +31,7 @@ def run_province(name, year=None):
         module.main()
 
     except Exception as e:
-        print(f"❌ {name} 执行失败: {e}")
+        log.error(f"❌ {name} 执行失败: {e}")
 
 def main():
     parser = argparse.ArgumentParser(description="跨省数据采集器")

+ 24 - 19
crossborder/fujian/selenium_fujian_download.py

@@ -11,7 +11,9 @@ from selenium.webdriver.support.ui import WebDriverWait
 
 from crossborder.fujian.fujian_parse_excel import parse_excel
 from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.db_helper import DBHelper
 from crossborder.utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
+from crossborder.utils.log import log
 from crossborder.utils.parse_utils import traverse_and_process
 
 # 基础配置
@@ -36,10 +38,10 @@ def detect_latest_month(driver):
             WebDriverWait(driver, 10).until(
                 EC.presence_of_element_located((By.XPATH, f'//a[contains(@title, "{target_title}")]'))
             )
-            print(f"已找到最新月份数据 {check_year}-{check_month}")
+            log.info(f"已找到最新月份数据 {check_year}-{check_month}")
             return check_year, check_month
         except:
-            print(f"未找到 {target_title}")
+            log.info(f"未找到 {target_title}")
             continue
     raise Exception("三个月内未找到有效数据")
 
@@ -71,15 +73,15 @@ def process_month_data(driver, year, month):
                     time.sleep(random.uniform(0.5, 1.5))  # 成功后等待
                 except Exception as e:
                     retry += 1
-                    print(f"下载 {title} 失败(第{retry}次重试): {str(e)}")
+                    log.error(f"下载 {title} 失败(第{retry}次重试): {str(e)}")
                     if retry >= MAX_RETRY:
-                        print(f"❌ 超出最大重试次数,跳过该文件:{title}")
+                        log.error(f"❌ 超出最大重试次数,跳过该文件:{title}")
                         return 1000
                     else:
-                        print(f"🔄 第{retry}次重试:{title}")
+                        log.error(f"🔄 第{retry}次重试:{title}")
                         time.sleep(random.uniform(2, 4))  # 重试前随机等待
 
-    print(f"本页找到{found_count}个有效表格")
+    log.info(f"本页找到{found_count}个有效表格")
     return found_count
 
 
@@ -89,7 +91,7 @@ def reverse_crawler(driver, target_months):
     # target_months = [(2023, 5), (2023, 4)]
     page = 1
     for year, month in target_months:
-        print(f"\n开始处理 {year}年{month}月数据".center(50, "="))
+        log.info(f"\n开始处理 {year}年{month}月数据".center(50, "="))
 
         WebDriverWait(driver, 15).until(
             EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
@@ -104,22 +106,22 @@ def reverse_crawler(driver, target_months):
 
             try:
                 # 动态检测当前页面月份
-                print(f"当前页面:{driver.current_url}, 第{page}页")
+                log.info(f"当前页面:{driver.current_url}, 第{page}页")
 
                 # 处理当前页面的表格数据
                 found = process_month_data(driver, year, month)
                 found_tables += found
 
                 if found_tables == 1000:
-                    print(f"❌{year}年{month}月数据采集失败,跳过当前月")
+                    log.error(f"❌{year}年{month}月数据采集失败,跳过当前月")
                     break
 
                 # 完成四个表格采集
                 if found_tables >= 1:
-                    print(f"已完成{year}年{month}月全部表格采集")
+                    log.info(f"已完成{year}年{month}月全部表格采集")
                     processed_months.add((year, month))
                     break
-                print(f"第{page}页已采集表格数:{found_tables}/1,前往下一页采集")
+                log.info(f"第{page}页已采集表格数:{found_tables}/1,前往下一页采集")
                 # 分页操作(增强定位稳定性)
                 WebDriverWait(driver, 15).until(
                     EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
@@ -130,10 +132,10 @@ def reverse_crawler(driver, target_months):
 
 
             except TimeoutException:
-                print(f"未找到更多分页,已采集表格数:{found_tables}/1")
+                log.error(f"未找到更多分页,已采集表格数:{found_tables}/1")
                 break
             except Exception as e:
-                print(f"分页异常:{str(e)}")
+                log.error(f"分页异常:{str(e)}")
                 handle_retry(driver)  # 异常恢复函数
                 break
 
@@ -153,9 +155,9 @@ def handle_retry(driver):
         WebDriverWait(driver, 15).until(
             EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
         )
-        print("浏览器异常已恢复")
+        log.error("浏览器异常已恢复")
     except:
-        print("需要人工干预的严重错误")
+        log.error("需要人工干预的严重错误")
         raise
 
 
@@ -169,7 +171,7 @@ def main():
     try:
         # 智能检测最新有效月份
         valid_year, valid_month = detect_latest_month(driver)
-        print(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
+        log.info(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
 
         # 生成目标序列
         if args.year:
@@ -185,15 +187,18 @@ def main():
             # 未指定年份时:取最近两个月
             target_months = generate_month_sequence(valid_year, valid_month)
 
-        print(f"目标采集月份序列:{target_months}")
+        log.info(f"目标采集月份序列:{target_months}")
         reverse_crawler(driver, target_months)
-        print(f"{len(target_months)}个月份数据已采集完毕")
+        log.info(f"{len(target_months)}个月份数据已采集完毕")
 
     finally:
         if 'driver' in locals():
             driver.quit()
-        print("\n数据清洗入库中...")
+        log.info("\n数据清洗入库中...")
         traverse_and_process(download_dir, parse_excel, province_name="fujian")
+        log.info("\n福建省地级市数据同比更新中...")
+        db_helper = DBHelper()
+        db_helper.update_prov_yoy("福建省")
 
 
 if __name__ == "__main__":

+ 2 - 1
crossborder/guangdong/guangdong_gongbei_parse_excel.py

@@ -4,9 +4,10 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
 from crossborder.utils.db_helper import DBHelper
-from crossborder.quanguo.detail import parse_value
+
 from crossborder.utils.constants import GUANGDONG_CITY
 from crossborder.utils.log import log
+from crossborder.utils.parse_utils import parse_value
 
 PROV_CODE = "440000"
 PROV_NAME = "广东省"

+ 21 - 20
crossborder/henan/selenium_henan_download.py

@@ -15,6 +15,7 @@ from selenium.webdriver.support.ui import WebDriverWait
 from crossborder.henan.henan_parse_excel import parse_excel
 from crossborder.utils.constants import DOWNLOAD_DIR
 from crossborder.utils.download_utils import configure_stealth_options, get_previous_month, download_excel, generate_month_sequence
+from crossborder.utils.log import log
 from crossborder.utils.parse_utils import traverse_and_process
 
 # 基础配置
@@ -26,7 +27,7 @@ download_dir = DOWNLOAD_DIR / "henan"
 
 
 
-
+#####河南省海关无地级市进出口数据,此处地级市数据从海关总署取
 
 def detect_latest_month(driver):
     """三级回溯智能检测最新有效月份(使用正则简化匹配)"""
@@ -52,12 +53,12 @@ def detect_latest_month(driver):
             for element in elements:
                 title = element.get_attribute("title")
                 if pattern.search(title):
-                    print(f"已找到最新月份数据 {check_year}-{check_month}")
+                    log.info(f"已找到最新月份数据 {check_year}-{check_month}")
                     return check_year, check_month
 
-            print(f"未找到匹配项(正则:{pattern.pattern})")
+            log.info(f"未找到匹配项(正则:{pattern.pattern})")
         except TimeoutException:
-            print(f"页面加载超时或无匹配项({check_year}-{check_month})")
+            log.error(f"页面加载超时或无匹配项({check_year}-{check_month})")
             continue
 
     raise Exception("三个月内未找到有效数据")
@@ -102,14 +103,14 @@ def process_month_data(driver, year, month):
                     success = True  # 成功则跳出循环
                 except Exception as e:
                     retry += 1
-                    print(f"下载 {title} 失败(第{retry}次重试): {e}")
-                    traceback.print_exc()
+                    log.error(f"下载 {title} 失败(第{retry}次重试): {e}")
+                    traceback.log.info_exc()
                     if retry < max_retries:
                         time.sleep(random.uniform(2, 5))  # 随机等待后再试
                     else:
-                        print(f"{title} 下载已达到最大重试次数,跳过该文件。")
+                        log.error(f"{title} 下载已达到最大重试次数,跳过该文件。")
 
-    print(f"本页找到{found_count}个有效表格")
+    log.error(f"本页找到{found_count}个有效表格")
     return found_count
 
 
@@ -119,7 +120,7 @@ def reverse_crawler(driver, target_months):
     # target_months = [(2023, 5), (2023, 4)]
     page = 1
     for year, month in target_months:
-        print(f"\n开始处理 {year}年{month}月数据".center(50, "="))
+        log.info(f"\n开始处理 {year}年{month}月数据".center(50, "="))
 
         WebDriverWait(driver, 15).until(
             EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
@@ -133,18 +134,18 @@ def reverse_crawler(driver, target_months):
             random_sleep(base=2, variance=3)
 
             try:
-                print(f"当前页面:{driver.current_url}, 第{page}页")
+                log.info(f"当前页面:{driver.current_url}, 第{page}页")
                 # 处理当前页面的表格数据
                 found = process_month_data(driver, year, month)
                 found_tables += found
 
                 # 完成四个表格采集
                 if found_tables >= 3:
-                    print(f"已完成{year}年{month}月全部表格采集")
+                    log.info(f"已完成{year}年{month}月全部表格采集")
                     processed_months.add((year, month))
                     break
 
-                print(f"第{page}页已采集表格数:{found_tables}/3,前往下一页采集")
+                log.info(f"第{page}页已采集表格数:{found_tables}/3,前往下一页采集")
                 # 分页操作(增强定位稳定性)
                 WebDriverWait(driver, 15).until(
                     EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"下一页")]'))
@@ -155,10 +156,10 @@ def reverse_crawler(driver, target_months):
 
 
             except TimeoutException:
-                print(f"未找到更多分页,已采集表格数:{found_tables}/3")
+                log.error(f"未找到更多分页,已采集表格数:{found_tables}/3")
                 break
             except Exception as e:
-                print(f"分页异常:{str(e)}")
+                log.error(f"分页异常:{str(e)}")
                 handle_retry(driver)  # 异常恢复函数
                 break
 
@@ -188,9 +189,9 @@ def handle_retry(driver):
         WebDriverWait(driver, 15).until(
             EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))
         )
-        print("浏览器异常已恢复")
+        log.info("浏览器异常已恢复")
     except:
-        print("需要人工干预的严重错误")
+        log.error("需要人工干预的严重错误")
         raise
 
 
@@ -204,7 +205,7 @@ def main():
     try:
         # 智能检测最新有效月份
         valid_year, valid_month = detect_latest_month(driver)
-        print(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
+        log.info(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
 
         # 生成目标序列
         if args.year:
@@ -219,13 +220,13 @@ def main():
             # 未指定年份时:取最近两个月
             target_months = generate_month_sequence(valid_year, valid_month)
 
-        print(f"目标采集月份序列:{target_months}")
+        log.info(f"目标采集月份序列:{target_months}")
         reverse_crawler(driver, target_months)
-        print(f"{len(target_months)}个月份数据已采集完毕")
+        log.info(f"{len(target_months)}个月份数据已采集完毕")
 
     finally:
         driver.quit()
-        print("\n数据清洗入库中...")
+        log.info("\n数据清洗入库中...")
         traverse_and_process(download_dir, parse_excel, province_name="henan")
 
 

+ 67 - 12
crossborder/quanguo/data_cleaning_to_db.py

@@ -1,5 +1,6 @@
 import os
 import re
+from datetime import datetime
 from pathlib import Path
 
 from crossborder.quanguo.parse_commodity_country_detail_excel import parse_commodity_country_detail
@@ -9,6 +10,7 @@ from crossborder.quanguo.parse_month_excel import parse_month_table_excel
 from crossborder.quanguo.parse_region_table_excel import parse_region_table_excel
 from crossborder.quanguo.parse_year_excel import parse_year_table_excel
 from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.db_helper import DBHelper
 from crossborder.utils.log import log
 
 
@@ -17,16 +19,6 @@ def perform_data_cleanup_and_import(current_year):
     数据清洗与入库主函数
     :param current_year: 当前年份,用于定位数据目录
     """
-    # from crossborder.quanguo.parse_year_excel.py import (
-    #     parse_year_table_excel,
-    #     parse_month_table_excel,
-    #     parse_country_table_excel,
-    #     parse_commodity_table_excel,
-    #     parse_region_table_excel,
-    #     parse_commodity_country_export,
-    #     parse_commodity_country_import
-    # )
-
     # 构建当前年度数据目录路径
     year_data_dir = DOWNLOAD_DIR / "total" / str(current_year)
 
@@ -92,5 +84,68 @@ def perform_data_cleanup_and_import(current_year):
     except Exception as e:
         log.error(f"数据清洗失败: {str(e)}")
         raise
-if __name__ == "__main__":
-    perform_data_cleanup_and_import(2025)
+    finally:
+        log.info("更新省市同比数据!")
+        db =DBHelper()
+        db.update_prov_yoy("河南省")
+
+def process_all_region_tables():
+    """
+    Re-parse every consignee/consignor region table found under DOWNLOAD_DIR / "total".
+
+    Years are visited in descending order, from the current year down to
+    2023 inclusive; within each year the month directories are also visited
+    in descending order. Every Excel file whose name contains '(8)'
+    (full-width parentheses) is handed to parse_region_table_excel.
+
+    Errors raised while processing a year are logged and swallowed, so the
+    remaining years are still processed. After each year whose directory
+    exists (whether it succeeded, failed, or had no month directories) the
+    year-over-year figures are refreshed via DBHelper.update_prov_yoy("河南省").
+    """
+    # Start at the current year and walk back to 2023 (the stop value 2022 is exclusive)
+    for year in range(datetime.now().year, 2022, -1):  # e.g. 2025 -> 2024 -> 2023
+        year_data_dir = DOWNLOAD_DIR / "total" / str(year)
+        if not year_data_dir.exists():
+            log.warning(f"{year_data_dir} 目录不存在,跳过该年份")
+            continue
+
+        log.info(f"\n开始处理 {year} 年的收发货人所在地表...")
+
+        try:
+            # Collect month sub-directories: only names that are exactly two digits (e.g. 01, 02)
+            month_dirs = [
+                d for d in os.listdir(year_data_dir)
+                if re.match(r'^\d{2}$', d)
+            ]
+
+            if not month_dirs:
+                log.warning(f"{year_data_dir} 下未找到任何月份目录,跳过该年份")
+                # NOTE: this `continue` sits inside the try, so the finally block below still runs
+                continue
+
+            # Sort month names in descending order (12 first)
+            sorted_months = sorted(month_dirs, reverse=True)
+
+            for month in sorted_months:
+                month_path = Path(year_data_dir) / month
+                log.info(f"正在处理月份目录: {month_path}")
+
+                # Walk every entry in this month directory
+                for file in os.listdir(month_path):
+                    full_path = month_path / file
+
+                    # Skip anything that is not an Excel file
+                    if not file.endswith(('.xls', '.xlsx')):
+                        continue
+
+                    # Region tables are recognised by '(8)' (full-width parentheses) in the file name
+                    if '(8)' in file:
+                        log.info(f"处理收发货人所在地表: {file}")
+                        parse_region_table_excel(full_path)
+
+            log.info(f"{year} 年的数据处理完成!")
+
+        except Exception as e:
+            # Log and move on: a failure in one year does not abort the remaining years
+            log.error(f"{year} 年数据处理失败: {str(e)}")
+        finally:
+            # Runs once per processed year, on success, failure, or the early `continue` above
+            log.info("更新省市同比数据!")
+            db = DBHelper()
+            db.update_prov_yoy("河南省")
+
+
+# if __name__ == "__main__":
+#     process_all_region_tables()
+
+
+
+# if __name__ == "__main__":
+#     perform_data_cleanup_and_import(2025)

+ 66 - 24
crossborder/quanguo/parse_region_table_excel.py

@@ -22,10 +22,11 @@ def parse_region_table_excel(file_path):
         print(f"文件读取失败:{file_path}\n错误:{str(e)}")
         return 0
 
-    params = []
+    params_old = []  # 老逻辑的数据(非地级市)
+    params_new = []  # 新逻辑的地级市数据
 
     # SQL模板使用命名占位符
-    sql_template = """
+    sql_template_old = """
                    INSERT INTO `t_yujin_crossborder_region_trade`
                    (`year_month`, `region_code`, `region_name`, `region_type`,
                     `monthly_total`, `monthly_export`, `monthly_import`,
@@ -46,6 +47,22 @@ def parse_region_table_excel(file_path):
                    ,create_time = NOW()
                    """
 
+    # SQL模板2:新表 - t_yujin_crossborder_prov_region_trade
+    sql_template_new = """
+        INSERT INTO `t_yujin_crossborder_prov_region_trade`
+        (`crossborder_year`, `crossborder_year_month`, `prov_code`, `prov_name`,
+         `city_code`, `city_name`, `monthly_total`, `monthly_import`, `monthly_export`)
+        VALUES 
+        (:crossborder_year, :crossborder_year_month, :prov_code, :prov_name,
+         :city_code, :city_name, :monthly_total, :monthly_import, :monthly_export)
+        ON DUPLICATE KEY UPDATE
+            monthly_total = VALUES(monthly_total),
+            monthly_import = VALUES(monthly_import),
+            monthly_export = VALUES(monthly_export),
+            create_time = NOW()
+    """
+
+
     # 从第7行开始读取(索引6)
     for row_idx in range(6, sheet.nrows):
         try:
@@ -68,20 +85,39 @@ def parse_region_table_excel(file_path):
                     print(f"⚠️ 地区匹配失败:{region_name}")
                     continue
 
-            # 构建参数字典
-            param_dict = {
-                "year_month": year_month,
-                "region_code": region_info['code'],  # region_code
-                "region_name": region_name,
-                "region_type": region_info['type'],
-                "monthly_total": parse_value(row[2]),  # monthly_total
-                "monthly_import": parse_value(row[4]),  # monthly_import
-                "monthly_export": parse_value(row[6]),  # monthly_export
-                "ytd_total": parse_value(row[8]),  # ytd_total
-                "ytd_import": parse_value(row[9]),  # ytd_import
-                "ytd_export": parse_value(row[10])  # ytd_export
-            }
-            params.append(param_dict)
+            is_municipality = region_name in ["北京市","天津市","上海市","重庆市"]
+            is_city = region_name.endswith("市") and not is_municipality
+
+
+            if is_city:
+                # ✅ 地级市(非直辖市),写入新表
+                param_dict_new = {
+                    "crossborder_year": year,
+                    "crossborder_year_month": year_month,
+                    "prov_code": region_info['province_code'],
+                    "prov_name": region_info['province_name'],
+                    "city_code": region_info['code'],
+                    "city_name": region_name,
+                    "monthly_total": parse_value(row[2]),
+                    "monthly_import": parse_value(row[6]),
+                    "monthly_export": parse_value(row[4]),
+                }
+                params_new.append(param_dict_new)
+            else:
+                # ❌ 非地级市(含省、自治区、直辖市等),写入老表
+                param_dict_old = {
+                    "year_month": year_month,
+                    "region_code": region_info['code'],
+                    "region_name": region_name,
+                    "region_type": region_info['type'],
+                    "monthly_total": parse_value(row[2]),
+                    "monthly_import": parse_value(row[6]),
+                    "monthly_export": parse_value(row[4]),
+                    "ytd_total": parse_value(row[8]),
+                    "ytd_import": parse_value(row[10]),
+                    "ytd_export": parse_value(row[9]),
+                }
+                params_old.append(param_dict_old)
 
         except Exception as e:
             print(f"行{row_idx}处理失败:{str(e)}")
@@ -89,15 +125,21 @@ def parse_region_table_excel(file_path):
 
     # 使用DBHelper执行批量SQL
     try:
-        if params:
-            row_count = db.execute_sql_with_params(sql_template, params)
-            print(f"✅ 成功处理 {len(params)} 条记录,插入/更新 {row_count} 行")
-            return row_count
-        print(f"⚠️ 未找到有效数据:{file_path}")
-        return 0
+        if params_old:
+            row_count = db.execute_sql_with_params(sql_template_old, params_old)
+            print(f"✅ 成功处理 {len(params_old)} 条记录(老表),插入/更新 {row_count} 行")
     except Exception as e:
-        print(f"数据库操作失败:{str(e)}")
-        return 0
+        print(f"数据库操作失败(老表):{str(e)}")
+
+    # 执行新表插入
+    try:
+        if params_new:
+            row_count = db.execute_sql_with_params(sql_template_new, params_new)
+            print(f"✅ 成功处理 {len(params_new)} 条记录(新表),插入/更新 {row_count} 行")
+    except Exception as e:
+        print(f"数据库操作失败(新表):{str(e)}")
+
+    return len(params_old) + len(params_new)
 
 
 if __name__ == "__main__":

+ 2 - 1
crossborder/quanguo/selenium_download.py

@@ -13,6 +13,7 @@ from selenium.webdriver import FirefoxOptions, ActionChains
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
+from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import
 from crossborder.quanguo.parse_country_table_excel import parse_country_table_excel
 from crossborder.quanguo.parse_month_excel import parse_month_table_excel
 from crossborder.quanguo.parse_year_excel import parse_year_table_excel
@@ -179,4 +180,4 @@ if __name__ == "__main__":
         driver.quit()
         log.info("【海关总署】全年数据抓取结束".center(66, "*"))
         log.info("\n数据清洗入库中...")
-        # perform_data_cleanup_and_import(current_year)
+        perform_data_cleanup_and_import(current_year)

+ 1 - 4
crossborder/shandong/shandong_parse_excel.py

@@ -5,8 +5,7 @@ import numpy as np
 import pandas as pd
 
 from crossborder.utils.db_helper import DBHelper
-from crossborder.quanguo.CountryTrade import COUNTRY_CODE_MAPPING
-from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.constants import DOWNLOAD_DIR, COUNTRY_CODE_MAPPING
 from crossborder.utils.log import log
 from crossborder.utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, \
     extract_year_month_from_path, get_previous_month_dir, find_unmatched_countries, traverse_and_process
@@ -78,8 +77,6 @@ def process_combined_trade(current_dir, year, month, previous_dir=None):
         prev_export = next(Path(previous_dir).glob("*出口20位主要商品总值*"), None)
         if prev_import and prev_export:
             prev_data = read_trade_pair(prev_import, prev_export)
-    if prev_data.empty:
-        raise FileNotFoundError("缺少上个月数据")
     # 计算逻辑优化
     merged_data = current_data if month == 1 else calculate_monthly_values(current_data, prev_data)
 

+ 14 - 9
crossborder/utils/constants.py

@@ -2,15 +2,15 @@ import os
 import sys
 from pathlib import Path
 
-# PROJECT_ROOT = Path(os.path.abspath(os.path.dirname(__file__))).parent.parent
+PROJECT_ROOT = Path(os.path.abspath(os.path.dirname(__file__))).parent.parent
 
-# if sys.platform.startswith('linux'):
-#     # Linux环境指定为/home目录
-#     DOWNLOAD_DIR = Path('/home/downloads')
-# else:
-#     # Windows保持原有结构(项目根目录下的downloads文件夹)
-#     DOWNLOAD_DIR = PROJECT_ROOT / 'downloads'
-DOWNLOAD_DIR = Path(os.getcwd(), '/downloads')
+if sys.platform.startswith('linux'):
+    # Linux环境使用当前用户主目录(Path.home())下的downloads目录
+    DOWNLOAD_DIR = Path.home() / 'downloads'
+else:
+    # Windows保持原有结构(项目根目录下的downloads文件夹)
+    DOWNLOAD_DIR = PROJECT_ROOT / 'downloads'
+# DOWNLOAD_DIR = Path(os.getcwd(), '/downloads')
 DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True)
 
 EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
@@ -341,7 +341,12 @@ REGION_MAPPING = {
 
     # 特别行政区
     "香港特别行政区": {"code": "810000", "type": "province"},
-    "澳门特别行政区": {"code": "820000", "type": "province"}
+    "澳门特别行政区": {"code": "820000", "type": "province"},
+
+
+    #地级市
+    "郑州市": {"code": "410100", "province_code":"410000","province_name":"河南省"},
+    "洛阳市": {"code": "410300", "province_code":"410000","province_name":"河南省"}
 }
 
 GUANGDONG_CITY = {

+ 1 - 0
crossborder/utils/db_helper.py

@@ -217,6 +217,7 @@ class DBHelper:
             log.info(f"{prov_name}新数据更新数: {result.rowcount}")
             return result.rowcount
 
+
     def query(self, sql, params=None, return_df=True):
         """
         执行带参数的SQL语句(支持批量插入/更新)

+ 7 - 3
crossborder/utils/download_utils.py

@@ -107,9 +107,13 @@ def download_excel(driver, url, year, month, title, download_dir):
         # log.info(f"√ 已点击下载按钮:{download_btn.get_attribute("href")}")
 
         downloaded_file  = wait_for_download(download_dir)
-        final_path = Path(f'{download_dir}/{year}/{month:02d}/{title}{downloaded_file.suffix}')
-        if final_path.exists():
-            final_path.unlink()
+        final_dir = Path(f'{download_dir}/{year}/{month:02d}')
+        final_path = Path(f'{final_dir}/{title}{downloaded_file.suffix}')
+        # 删除 final_dir 中所有与 base_name 同名但不同后缀的文件
+        for old_file in final_dir.glob(f'{title}.*'):
+            if old_file.is_file():
+                old_file.unlink()
+                log.info(f"已删除旧文件:{old_file}")
         download_rel_dir = Path(f'{download_dir}/{year}/{month:02d}')
         download_rel_dir.mkdir(parents=True, exist_ok=True)
         downloaded_file.rename(final_path)

+ 1 - 1
crossborder/utils/log.py

@@ -14,7 +14,7 @@ project_root = Path(os.getcwd()).parent.parent
 
 if sys.platform.startswith('linux'):
     # Linux环境指定为/home目录
-    log_dir = Path('/home/logs')
+    log_dir = Path.home() / 'logs'
 else:
     log_dir = project_root / 'logs'
 

+ 2 - 2
crossborder/utils/parse_utils.py

@@ -99,8 +99,8 @@ def find_unmatched_countries(final_df):
 def extract_year_month_from_path(path):
     parts = path.parts
     try:
-        year_part = parts[-3]
-        month_part = parts[-2]
+        year_part = parts[-2]
+        month_part = parts[-1]
         if not YEAR_PATTERN.match(year_part):
             raise ValueError(f"无效年份格式:{year_part}")
         if not MONTH_PATTERN.match(month_part):