Selaa lähdekoodia

1、日志修改
2、山东省数据解析 pandas 指定 engine,防止解析时格式识别失败

01495251 1 kuukausi sitten
vanhempi
commit
147f342848

+ 1 - 1
crossborder/fujian/selenium_fujian_download.py

@@ -94,7 +94,7 @@ def reverse_crawler(driver, target_months):
     # target_months = [(2023, 5), (2023, 4)]
     page = 1
     for year, month in target_months:
-        log.info(f"\n开始处理 {year}年{month}月数据".center(50, "="))
+        log.info(f"开始处理 {year}年{month}月数据".center(50, "="))
 
         WebDriverWait(driver, 15).until(
             EC.presence_of_element_located((By.CLASS_NAME, "conList_ul"))

+ 1 - 1
crossborder/guangdong/selenium_guangdong_download.py

@@ -339,7 +339,7 @@ def reverse_crawler(driver, target_months):
     processed_months = set()
     page = 1
     for year, month in target_months:
-        log.info(f"\n开始处理 {year}年{month}月数据".center(50, "="))
+        log.info(f"开始处理广东海关 {year}年{month}月数据".center(50, "="))
         WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, "conList_ul")))
 
         current_page = 1

+ 1 - 1
crossborder/henan/selenium_henan_download.py

@@ -114,7 +114,7 @@ def process_month_data(driver, year, month):
                     else:
                         log.error(f"{title} 下载已达到最大重试次数,跳过该文件。")
 
-    log.error(f"本页找到{found_count}个有效表格")
+    log.info(f"本页找到{found_count}个有效表格")
     return found_count
 
 

+ 4 - 5
crossborder/quanguo/data_cleaning_to_db.py

@@ -26,7 +26,7 @@ def perform_data_cleanup_and_import(current_year):
     # 构建当前年度数据目录路径
     year_data_dir = DOWNLOAD_DIR / "total" / str(current_year)
 
-    log.info(f"\n数据清洗入库中...")
+    log.info(f"数据清洗入库中...".center(66, "*"))
 
     try:
         # 获取所有月份子目录(如 01月、02月)
@@ -83,15 +83,14 @@ def perform_data_cleanup_and_import(current_year):
             else:
                 log.warning(f"未知类型文件,跳过: {full_path}")
 
-        log.info("数据清洗与入库完成!")
-
     except Exception as e:
         log.error(f"数据清洗失败: {str(e)}")
         raise
     finally:
         log.info("更新省市同比数据!")
-        base_mysql.update_shandong_yoy("河南省")
-        base_mysql.update_shandong_yoy_origin("山东省")
+        for province in provinces:
+            base_mysql.update_shandong_yoy_origin(province)
+        log.info(f"数据清洗入库完成".center(66, "*"))
 
 def main():
     """

+ 8 - 8
crossborder/quanguo/selenium_download.py

@@ -24,7 +24,7 @@ log = get_logger(__name__)
 base_url = "http://www.customs.gov.cn/customs/302249/zfxxgk/2799825/302274/302277/6348926/index.html"
 download_dir = DOWNLOAD_DIR / "total"
 downloaded_tables = set()  # 已下载的表格名集合
-
+data_collected = False  # 是否有数据被采集
 
 
 def generate_table_title(year):
@@ -118,7 +118,8 @@ def go_to_year_page(driver, year):
         return False
 
 
-def crawl_with_selenium(driver, year, latest_only=False,data_collected=False):
+def crawl_with_selenium(driver, year, latest_only=False):
+    global data_collected
     """主抓取函数"""
     if year < datetime.now().year:
         if not go_to_year_page(driver, year):
@@ -195,7 +196,7 @@ def handle_month_data(driver, table_name, month_links, year):
                # 下载成功后将表格名加入集合
                downloaded_tables.add(table_name)
             except Exception as e:
-               log.info(f"【异常】下载失败: {str(e)}")
+               log.error(f"【异常】{year}-{month_num:02d} {table_name}: {str(e)}")
             time.sleep(random.uniform(0.5, 1.5))  # 下载间隔
 def  main():
 
@@ -206,7 +207,6 @@ def  main():
 
     start_time = time.time()
     years_processed = []  # 记录成功处理的年份
-    data_collected = False  # 是否有数据被采集
     driver =  None
 
     try:
@@ -236,12 +236,12 @@ def  main():
         # 4. 年份遍历采集
         for year in years_to_crawl:
             try:
-                log.info(f"\n【{year}年】开始处理".center(66, "-"))
+                log.info(f"【{year}年】开始处理".center(66, "-"))
 
                 is_latest_only = (not args.year) and (year == current_year)
 
                 # 执行年份采集
-                crawl_with_selenium(driver, year=year, latest_only=is_latest_only,data_collected =  data_collected)
+                crawl_with_selenium(driver, year=year, latest_only=is_latest_only)
                 years_processed.append(year)
 
                 log.info(f"【{year}年】处理完成".center(66, "-"))
@@ -263,11 +263,11 @@ def  main():
 
         # 6. 数据清洗入库(仅当有数据被采集时)
         if data_collected:
-            log.info("\n【海关总署】数据清洗入库开始".center(66, "*"))
+            log.info("【海关总署】数据清洗入库开始".center(66, "*"))
 
             try:
                 log.info("数据清洗入库中...")
-                perform_data_cleanup_and_import(years_processed)
+                perform_data_cleanup_and_import(current_year)
                 log.info("数据清洗入库完毕")
 
             except Exception as e:

+ 3 - 3
crossborder/shandong/selenium_shandong_download.py

@@ -212,15 +212,15 @@ def main():
         data_collected = True
 
         # 6. 数据清洗入库
-        log.info("\n【山东海关】数据清洗入库中...")
+        log.info("【山东海关】数据清洗入库中...")
         traverse_and_process(download_dir, parse_excel,
                              province_name="shandong",
                              year=args.year)
 
         # 7. 同比数据更新
-        log.info("\n【山东海关】地级市数据同比更新中...")
+        log.info("【山东海关】地级市数据同比更新中...")
         db.update_prov_yoy("山东省")
-        log.info("\n【山东海关】同比更新完成")
+        log.info("【山东海关】同比更新完成")
 
     except Exception as e:
         # 捕获并记录所有异常

+ 4 - 2
crossborder/shandong/shandong_parse_excel.py

@@ -292,12 +292,14 @@ def read_with_header4(file_path):
 
 def read_trade_pair(import_path, export_path):
     """进/出口表格合并"""
-    df_import = pd.read_excel(import_path, skiprows=3, skipfooter=1,
+    # 显式指定引擎,防止格式识别失败
+    engine = 'openpyxl' if str(import_path).endswith('.xlsx') else 'xlrd'
+    df_import = pd.read_excel(import_path, skiprows=3, skipfooter=1, engine=engine,
                               usecols=[0, 1], names=["commodity_name", "monthly_import"]).pipe(lambda df: df.assign(
         commodity_name=df["commodity_name"].apply(clean_commodity_name)
     ))
 
-    df_export = pd.read_excel(export_path, skiprows=3, skipfooter=1,
+    df_export = pd.read_excel(export_path, skiprows=3, skipfooter=1, engine=engine,
                               usecols=[0, 1], names=["commodity_name", "monthly_export"]).pipe(lambda df: df.assign(
         commodity_name=df["commodity_name"].apply(clean_commodity_name)
     ))

+ 1 - 1
crossborder/utils/download_utils.py

@@ -265,7 +265,7 @@ def download_excel2(driver, link, year, month, title, download_dir):
         log.info(f"√ 文件已保存至:{final_path}")
 
     except TimeoutError as te:
-        log.info(f"[错误] 文件下载超时:{te}")
+        log.info(f"[错误] 文件 {title} 下载超时:{te}")
         raise
     except Exception as e:
         log.info(f"[错误] 发生异常:{e}")

+ 3 - 3
crossborder/utils/parse_utils.py

@@ -21,7 +21,7 @@ def parse_value(val):
             return Decimal(val).quantize(Decimal('0.0000'))  # 用Decimal处理科学计数法,确保四位小数
         return Decimal(str(val).replace(',', '')).quantize(Decimal('0.0000'))  # 保留四位小数
     except Exception as e:
-        print(f"数值解析错误:{val},错误:{e}")
+        log.error(f"数值解析错误:{val},错误:{e}")
         return None
 
 def convert_unit(value):
@@ -93,11 +93,11 @@ def find_unmatched_countries(final_df):
         unmatched_names = final_df.loc[unmatched_mask, 'country_name'].unique()
 
         # 输出警告信息
-        log.info("⚠️ 以下国家名称未在 COUNTRY_CODE_MAPPING 中找到匹配:")
+        log.warning("⚠️ 以下国家名称未在 COUNTRY_CODE_MAPPING 中找到匹配:")
 
         # 打印所有未匹配的国家名称,按字母排序
         for name in sorted(unmatched_names):
-            log.info(f"   - {name}")
+            log.warning(f"   - {name}")
 
 def extract_year_month_from_path(path):
     """