Przeglądaj źródła

广东省分地市清洗数据缺失月份数据逻辑兼容-深圳海关、湛江海关

01495251 1 miesiąc temu
rodzic
commit
85bc7103e3

+ 1 - 1
crossborder/fujian/fujian_parse_excel.py

@@ -302,5 +302,5 @@ if __name__ == "__main__":
     traverse_and_process(download_dir, parse_excel, province_name="fujian")
     print("更新同比数据……")
     db_helper = DBHelper()
-    db_helper.update_january_yoy()
+    db_helper.update_prov_yoy("福建省")
     # parse_excel(download_dir/"2023"/"02")

+ 212 - 80
crossborder/guangdong/guangdong_gongbei_parse_excel.py

@@ -1,3 +1,5 @@
+import re
+
 import pandas as pd
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
@@ -14,11 +16,22 @@ PROV_NAME = "广东省"
 
 db = DBHelper()
 
+"""
+2023年9月 中山市数据,缺少城市数据列,需要特殊兼容
+个别月份数据,表头行数不一致,因此这里起始行数据,需要动态解析
+"""
+
+
+def parse_page_region_data(driver, url, year, month, title):
+    """第一阶段:按原始逻辑入库,增加标题中提取城市功能"""
+    # 先从标题中提取城市名称
+    page_city = extract_city_from_title(title)
 
-def parse_region_table(driver, url, year, month, title):
-    """第一阶段:按原始逻辑入库(1月是单月,其他月份是1-X月累计)"""
-    log.info(f"开始解析{PROV_NAME} {year}年{month}月 {title}")
-    data = parse_page_data(driver, url, year, month)
+    if not page_city:
+        log.warning(f"标题中未识别到城市: {title}")
+
+    log.info(f"开始解析{PROV_NAME} {year}年{month}月 {title} ({page_city if page_city else '城市未知'})")
+    data = parse_page_data(driver, url, year, month, title, page_city)
 
     if data:
         df = pd.DataFrame(data)
@@ -32,6 +45,201 @@ def parse_region_table(driver, url, year, month, title):
         log.info(f"{PROV_NAME} {year}年{month}月 {title}数据解析完成")
 
 
+def extract_city_from_title(title):
+    """Return the city named in a page title, or None.
+
+    Only the two cities "中山市" and "珠海市" are recognised; the first
+    occurrence found in ``title`` wins.
+    """
+    found = re.search(r"(中山市|珠海市)", title)
+    return found.group(1) if found else None
+
+
+def parse_page_data(driver, url, year, month, title, page_city=None):
+    """Scrape one statistics page and return its rows as record dicts.
+
+    Opens ``url`` in a new browser window, waits for the data table, and
+    builds one dict per row whose city can be resolved to a code in
+    ``GUANGDONG_CITY``.
+
+    Args:
+        driver: Selenium WebDriver used to open and read the page.
+        url: Address of the page to scrape.
+        year: Year stored in ``crossborder_year``.
+        month: Month number; formatted as two digits in
+            ``crossborder_year_month``.
+        title: Page title, used to derive the city when ``page_city`` is
+            not supplied.
+        page_city: City name applied to rows that carry no city of their
+            own; extracted from ``title`` when falsy.
+
+    Returns:
+        list[dict]: Parsed records; empty when the table has fewer than
+        four rows or no data start row is found.
+
+    Raises:
+        Exception: Any page-level error is logged and re-raised.
+    """
+    data = []
+    try:
+        # If no city was passed in, try to extract it from the title
+        if not page_city:
+            page_city = extract_city_from_title(title)
+
+        log.info(f"当前采集数据页面:{url} [城市: {page_city if page_city else '未知'}]")
+        driver.execute_script(f"window.open('{url}')")
+        driver.switch_to.window(driver.window_handles[-1])
+
+        table_xpath = '//table[contains(@style, "BORDER-COLLAPSE: collapse")]'
+
+        WebDriverWait(driver, 60).until(
+            EC.presence_of_element_located(
+                (By.XPATH, table_xpath))
+        )
+
+        table = driver.find_element(By.XPATH, table_xpath)
+
+        # NOTE(review): find_element normally raises instead of returning a
+        # falsy value, so this guard is probably unreachable — confirm.
+        if not table:
+            log.warning("未找到表格元素")
+            return data
+
+        rows = table.find_elements(By.TAG_NAME, 'tr')
+        if len(rows) < 4:
+            log.warning("表格行数不足")
+            return data
+
+        # Detect where the header ends; the number of header rows varies
+        data_start_row = find_data_start_row(rows)
+        if data_start_row < 0:
+            log.warning("未找到数据起始行")
+            return data
+
+        # Baseline column positions of the value cells
+        BASE_COLUMN_MAPPING = {
+            'monthly_total': 1,
+            'yoy_import_export': 2,
+            'monthly_export': 5,
+            'yoy_export': 6,
+            'monthly_import': 9,
+            'yoy_import': 10
+        }
+
+        for row in rows[data_start_row:]:
+            cols = [td.text.strip() for td in row.find_elements(By.TAG_NAME, 'td')]
+
+            # Resolve the city name:
+            # 1. prefer a city found inside the row itself
+            # 2. otherwise fall back to the page-level city (from the title)
+            city_name = find_city_in_row(cols)
+
+            if not city_name and page_city:
+                # No city in the row but the title named one: use the title's city
+                city_name = page_city
+
+            if not city_name:
+                log.debug("无法识别城市名称,跳过此行")
+                continue
+
+            # Resolve the city code; rows for unknown cities are skipped
+            city_code = GUANGDONG_CITY.get(city_name)
+            if not city_code:
+                log.debug(f"跳过未识别的城市: {city_name}")
+                continue
+
+            # Work out which column indexes hold the values for this row
+            column_mapping = detect_column_mapping(cols, BASE_COLUMN_MAPPING, city_name)
+
+            try:
+                # Read the raw cell text from the mapped columns
+                monthly_total = get_value_safely(cols, column_mapping.get('monthly_total'))
+                monthly_import = get_value_safely(cols, column_mapping.get('monthly_import'))
+                monthly_export = get_value_safely(cols, column_mapping.get('monthly_export'))
+                yoy_import_export = get_value_safely(cols, column_mapping.get('yoy_import_export'))
+                yoy_export = get_value_safely(cols, column_mapping.get('yoy_export'))
+                yoy_import = get_value_safely(cols, column_mapping.get('yoy_import'))
+
+                item = {
+                    'city_name': city_name,
+                    'city_code': city_code,
+                    'crossborder_year': year,
+                    'crossborder_year_month': f"{year}-{month:02d}",
+                    'prov_code': PROV_CODE,
+                    'prov_name': PROV_NAME,
+                    'monthly_total': parse_value(monthly_total),
+                    'monthly_import': parse_value(monthly_import),
+                    'monthly_export': parse_value(monthly_export),
+                    'yoy_import_export': parse_value(yoy_import_export),
+                    'yoy_export': parse_value(yoy_export),
+                    'yoy_import': parse_value(yoy_import)
+                }
+
+                data.append(item)
+
+            except Exception as e:
+                # A bad row is logged and skipped so the rest of the table still loads
+                log.error(f"解析行数据失败: {str(e)}")
+                continue
+
+    except Exception as e:
+        log.error(f"解析页面失败:{str(e)}")
+        raise
+    finally:
+        # Always close the current window and return to the first handle.
+        # NOTE(review): if window.open() itself failed, the current window may
+        # still be the original one — confirm this cannot close the main window.
+        driver.close()
+        driver.switch_to.window(driver.window_handles[0])
+
+    return data
+
+
+def find_city_in_row(cols):
+    """Return the city a table row refers to, or None.
+
+    Two passes are made over the cell texts in ``cols``: first for any cell
+    containing a full city name, then for a cell that is exactly the short
+    form ("中山" or "珠海").
+    """
+    full_names = ("中山市", "珠海市")
+    short_forms = ((r"^中山$", "中山市"), (r"^珠海$", "珠海市"))
+
+    # Pass 1: full name appearing anywhere inside a cell
+    for cell in cols:
+        for name in full_names:
+            if name in cell:
+                return name
+
+    # Pass 2: a cell consisting solely of the abbreviated name
+    for cell in cols:
+        for pattern, name in short_forms:
+            if re.search(pattern, cell):
+                return name
+
+    return None
+
+
+def detect_column_mapping(cols, base_mapping, city_name):
+    """Return the column-index mapping to use for one table row.
+
+    The row is scanned for the first cell containing ``city_name``. When no
+    cell matches, or the match is the first cell, ``base_mapping`` itself is
+    returned. Otherwise a new dict is built in which each of the six value
+    columns is moved by ``city_index - 1`` positions.
+
+    NOTE(review): the baseline assumes the city sits in column 0, which would
+    suggest a shift of ``city_index`` rather than ``city_index - 1`` — confirm
+    against real pages before changing.
+    """
+    city_index = next(
+        (pos for pos, cell in enumerate(cols) if city_name in cell),
+        None,
+    )
+
+    # Falsy covers both "not found" (None) and the baseline layout (0)
+    if not city_index:
+        return base_mapping
+
+    value_keys = (
+        'monthly_total',
+        'yoy_import_export',
+        'monthly_export',
+        'yoy_export',
+        'monthly_import',
+        'yoy_import',
+    )
+    return {key: base_mapping[key] + city_index - 1 for key in value_keys}
+
+
+def find_data_start_row(rows):
+    """Return the index of the first data row, or -1 when none is found.
+
+    A header row is any row whose concatenated cell text contains one of
+    the header keywords; the row right after it is treated as the first
+    data row. A header match on the very last row is ignored because no
+    row follows it.
+    """
+    header_keywords = ["人民币"]
+    total = len(rows)
+
+    for idx, tr in enumerate(rows):
+        cells = tr.find_elements(By.TAG_NAME, 'td')
+        row_text = "".join([cell.text.strip() for cell in cells])
+
+        # Not a header row: keep scanning
+        if not any(keyword in row_text for keyword in header_keywords):
+            continue
+
+        log.debug(f"在行 {idx + 1} 找到表头行: {row_text}")
+        next_idx = idx + 1
+        if next_idx < total:
+            # Data starts on the row following the header
+            return next_idx
+
+    log.warning("无法识别数据起始行")
+    return -1
+
+
+def get_value_safely(cols, index):
+    """Return ``cols[index]`` or None when the index cannot be used.
+
+    None is returned for a non-integer index (including None itself) and
+    for any index outside ``0 <= index < len(cols)``; negative indexes are
+    therefore rejected rather than counted from the end.
+    """
+    # isinstance() already rejects None, so one check covers both cases
+    if not isinstance(index, int):
+        return None
+
+    return cols[index] if 0 <= index < len(cols) else None
+
 def calculate_monthly_data(year, month):
     """第二阶段:计算并更新单月数据(适用于非1月)"""
     if month == 1:
@@ -105,79 +313,3 @@ def calculate_monthly_data(year, month):
     log.info(f"{PROV_NAME} {year}年{month}月单月数据计算完成")
 
 
-def parse_page_data(driver, url, year, month):
-    """解析页面数据(保持不变)"""
-    data = []
-    try:
-        log.info(f"当前采集数据页面:{url}")
-        driver.execute_script(f"window.open('{url}')")
-        driver.switch_to.window(driver.window_handles[-1])
-
-        table_xpath = '//table[contains(@style, "BORDER-COLLAPSE: collapse")]'
-
-        WebDriverWait(driver, 60).until(
-            EC.presence_of_element_located(
-                (By.XPATH, table_xpath))
-        )
-
-        table = driver.find_element(By.XPATH, table_xpath)
-
-        if not table:
-            log.warning("未找到表格元素")
-            return data
-
-        rows = table.find_elements(By.TAG_NAME, 'tr')
-        if len(rows) < 4:
-            log.warning("表格行数不足")
-            return data
-
-        COLUMN_MAPPING = {
-            'city_name': 0,
-            'monthly_total': 1,
-            'yoy_import_export': 2,
-            'monthly_export': 5,
-            'yoy_export': 6,
-            'monthly_import': 9,
-            'yoy_import': 10
-        }
-
-        for row in rows[3:]:
-            cols = [td.text.strip() for td in row.find_elements(By.TAG_NAME, 'td')]
-
-            try:
-                city_name = cols[COLUMN_MAPPING['city_name']]
-                city_name = city_name.replace("广东省", "")
-                city_code = GUANGDONG_CITY.get(city_name)
-                if not city_code:
-                    log.debug(f"跳过未识别的城市: {city_name}")
-                    continue
-
-                item = {
-                    'city_name': city_name,
-                    'city_code': city_code,
-                    'crossborder_year': year,
-                    'crossborder_year_month': f"{year}-{month:02d}",
-                    'prov_code': PROV_CODE,
-                    'prov_name': PROV_NAME,
-                    'monthly_total': parse_value(cols[COLUMN_MAPPING['monthly_total']]),
-                    'monthly_import': parse_value(cols[COLUMN_MAPPING['monthly_import']]),
-                    'monthly_export': parse_value(cols[COLUMN_MAPPING['monthly_export']]),
-                    'yoy_import_export': parse_value(cols[COLUMN_MAPPING['yoy_import_export']]),
-                    'yoy_export': parse_value(cols[COLUMN_MAPPING['yoy_export']]),
-                    'yoy_import': parse_value(cols[COLUMN_MAPPING['yoy_import']])
-                }
-
-                data.append(item)
-
-            except Exception as e:
-                log.error(f"解析行数据失败: {str(e)}")
-                continue
-
-    except Exception as e:
-        log.error(f"解析页面失败:{str(e)}")
-        raise
-    finally:
-        driver.close()
-        driver.switch_to.window(driver.window_handles[0])
-
-    return data

+ 47 - 48
crossborder/guangdong/guangdong_sub_customs_parse_excel.py

@@ -34,7 +34,7 @@ def match_customs_file(filename, customs_name, year, month):
     if customs_name == "广州海关":
         return "7地市进出口综合统计" in filename_lower
     elif customs_name == "深圳海关":
-        return "深圳海关综合统计资料" in filename_lower
+        return "深圳关区综合统计资料" in filename_lower or "深圳海关综合统计资料" in filename_lower
     elif customs_name == "汕头海关":
         return "5市报表" in filename_lower
     elif customs_name == "黄埔海关":
@@ -44,7 +44,7 @@ def match_customs_file(filename, customs_name, year, month):
             return "外贸进出口有关情况统计表" in filename_lower
     elif customs_name == "湛江海关":
         if "湛江市" in filename_lower or "茂名市" in filename_lower:
-            return "外贸进出口数据" in filename_lower
+            return "进出口数据" in filename_lower
     return False
 
 
@@ -130,37 +130,36 @@ def process_guangzhou_customs(file_path, year, month,customs_type='guangzhou'):
         for idx in range(header_row + 1, len(df)):
             row = df.iloc[idx]
             city_cell = str(row[0])
-            if "广东省" in city_cell:
-                city_name = city_cell.replace("广东省", "").strip()
-                if city_name in target_cities:
-                    try:
-                        if len(data_cols)>3:
-                            monthly_total = Decimal(str(row[data_cols[0]]))  # 进出口
-                            monthly_export = Decimal(str(row[data_cols[4]]))  # 出口
-                            monthly_import = Decimal(str(row[data_cols[8]]))  # 进口
-                            yoy_import_export = Decimal(str(row[data_cols[1]]))  # 进出口同比
-                            yoy_export = Decimal(str(row[data_cols[5]]))  # 出口同比
-                            yoy_import = Decimal(str(row[data_cols[9]]))  # 进口同比
-                        else:
-                            monthly_total = Decimal(str(row[data_cols[0]]))
-                            monthly_export = Decimal(str(row[data_cols[1]]))
-                            monthly_import = Decimal(str(row[data_cols[2]]))
-                            yoy_import_export = Decimal(str(row[data_cols[0]+1]))  # 进出口同比
-                            yoy_export = Decimal(str(row[data_cols[1]+1]))  # 出口同比
-                            yoy_import = Decimal(str(row[data_cols[2]+1]))  # 进口同比
-
-
-                        results.append({
-                            "city_name": city_name,
-                            "monthly_total": monthly_total,
-                            "monthly_import": monthly_import,
-                            "monthly_export": monthly_export,
-                            "yoy_import_export": yoy_import_export,
-                            "yoy_import": yoy_import,
-                            "yoy_export": yoy_export
-                        })
-                    except Exception as e:
-                        log.error(f"处理行 {idx} 出错: {e}")
+            city_name = city_cell.replace("广东省", "").strip()
+            if city_name in target_cities:
+                try:
+                    if len(data_cols)>3:
+                        monthly_total = Decimal(str(row[data_cols[0]]))  # 进出口
+                        yoy_import_export = Decimal(str(row[data_cols[1]]))  # 进出口同比
+                        yoy_export = Decimal(str(row[data_cols[5]]))  # 出口同比
+                        monthly_export = Decimal(str(row[data_cols[4]]))  # 出口
+                        monthly_import = Decimal(str(row[data_cols[8]]))  # 进口
+                        yoy_import = Decimal(str(row[data_cols[9]]))  # 进口同比
+                    else:
+                        monthly_total = Decimal(str(row[data_cols[0]]))
+                        monthly_export = Decimal(str(row[data_cols[1]]))
+                        monthly_import = Decimal(str(row[data_cols[2]]))
+                        yoy_import_export = Decimal(str(row[data_cols[0]+1]))  # 进出口同比
+                        yoy_export = Decimal(str(row[data_cols[1]+1]))  # 出口同比
+                        yoy_import = Decimal(str(row[data_cols[2]+1]))  # 进口同比
+
+
+                    results.append({
+                        "city_name": city_name,
+                        "monthly_total": monthly_total,
+                        "monthly_import": monthly_import,
+                        "monthly_export": monthly_export,
+                        "yoy_import_export": yoy_import_export,
+                        "yoy_import": yoy_import,
+                        "yoy_export": yoy_export
+                    })
+                except Exception as e:
+                    log.error(f"处理行 {idx} 出错: {e}")
 
         return pd.DataFrame(results)
 
@@ -438,21 +437,21 @@ def process_zhanjiang_customs(file_path, year, month):
         # 从文件名确定城市
         city_name = "湛江市" if "湛江市" in file_path.name else "茂名市"
         # 查找月度数据表格
-        month_str = f"{year}年前{month}个月{city_name}进出口数据(月度)"
-
-        # target_header_row = None
-        #
-        # # 查找表头行
-        # for i in range(min(3, len(df))):  # 在前5行找表头
-        #     if any(month_str in str(cell) for cell in df.iloc[i]):
-        #         target_header_row = i
-        #         break
-        #
-        # if target_header_row is None:
-        #     log.error(f"未找到 {month_str} 表头")
-        #     return pd.DataFrame()
-
-        target_header_row =1
+        month_str = f"{city_name}进出口数据"
+
+        target_header_row = None
+
+        # 查找表头行
+        for i in range(min(3, len(df))):  # search the first 3 rows for the header
+            if any(month_str in str(cell) for cell in df.iloc[i]):
+                target_header_row = i
+                break
+
+        if target_header_row is None:
+            log.error(f"未找到 {month_str} 表头")
+            return pd.DataFrame()
+
+        # target_header_row =1
 
         # 确定数据列位置
         data_cols = {}

+ 3 - 2
crossborder/guangdong/selenium_guangdong_city.py

@@ -10,8 +10,9 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
+from crossborder.guangdong.guangdong_gongbei_parse_excel import parse_page_region_data, calculate_monthly_data
 from crossborder.utils.db_helper import DBHelper
-from crossborder.guangdong.guangdong_gongbei_parse_excel import parse_region_table, calculate_monthly_data
+
 from crossborder.guangdong.guangdong_sub_customs_parse_excel import parse_excel
 from crossborder.utils.constants import DOWNLOAD_DIR
 from crossborder.utils.constants import GUANGDONG_CUSTOMS_URL
@@ -113,7 +114,7 @@ def process_month_data(driver, year, month, customs_name,found_count, max_retrie
                         elif customs_name in ['湛江海关', '广州海关']:
                             batch_download_excel(driver, url, year, month, title, download_dir)
                         elif customs_name == "拱北海关":
-                            parse_region_table(driver, url, year, month, title)
+                            parse_page_region_data(driver, url, year, month,  title)
                         else:
                             download_excel(driver, url, year, month, title, download_dir)
                         found_count += 1

+ 5 - 1
crossborder/guangdong/selenium_guangdong_download.py

@@ -304,7 +304,11 @@ def merge_commodity_data(import_data, export_data, year, month):
     )
 
     # 计算总量(可选,根据表结构需求)
-    merged_df['monthly_total'] = merged_df['monthly_import'] + merged_df['monthly_export']
+
+    #进出口总值计算
+    merged_df['monthly_total'] = merged_df['monthly_import'].fillna(0) + merged_df['monthly_export'].fillna(0)
+    merged_df['monthly_total'] = merged_df['monthly_total'].replace(0, np.nan)
+
     merged_df['crossborder_year'] = year
     merged_df['crossborder_year_month'] = f"{year}-{month:02d}"
     merged_df['prov_code'] = PROV_CODE

+ 25 - 14
crossborder/henan/henan_parse_excel.py

@@ -2,6 +2,7 @@ import argparse
 import re
 from pathlib import Path
 
+import numpy as np
 import pandas as pd
 
 from crossborder.utils.db_helper import DBHelper
@@ -53,7 +54,14 @@ def process_combined_trade(current_dir, year, month):
 
     db = DBHelper()
 
+    invalid_names = ['', '--', '不详', '未知']
+    current_data = current_data[
+        current_data['commodity_name'].notna() &
+        ~current_data['commodity_name'].isin(invalid_names)
+        ].copy()
+
     current_data['commodity_code'] = current_data['commodity_name'].apply(db.get_commodity_id)
+
     valid_data = current_data[current_data['commodity_code'].notnull()].copy()
 
     # 构建当前月数据
@@ -61,14 +69,16 @@ def process_combined_trade(current_dir, year, month):
     valid_data['crossborder_year_month'] = f"{year}-{month:02d}"
     valid_data['prov_code'] = PROV_CODE
     valid_data['prov_name'] = PROV_NAME
-    valid_data['monthly_total'] = valid_data['monthly_import'] + valid_data['monthly_export']
+    #进出口总值计算
+    valid_data['monthly_total'] = valid_data['monthly_import'].fillna(0) + valid_data['monthly_export'].fillna(0)
+    valid_data['monthly_total'] = valid_data['monthly_total'].replace(0, np.nan)
 
     # 定义目标字段
     target_cols = [
         'crossborder_year', 'crossborder_year_month', 'prov_code', 'prov_name',
         'commodity_code', 'commodity_name', 'monthly_total', 'monthly_import', 'monthly_export'
     ]
-
+    valid_data = valid_data.replace({np.nan: None})
     # 写入当前月数据
     db.bulk_insert(
         valid_data[target_cols],
@@ -87,6 +97,7 @@ def process_combined_trade(current_dir, year, month):
         numeric_cols = ['monthly_total', 'monthly_import', 'monthly_export']
         january_data[numeric_cols] = january_data[numeric_cols] / 2
 
+        january_data = january_data.replace({np.nan: None})
         # 写入模拟1月数据
         db.bulk_insert(
             january_data[target_cols],
@@ -119,6 +130,8 @@ def process_country_trade(current_file_path, year, month):
     final_df['prov_code'] = PROV_CODE
     final_df['prov_name'] = PROV_NAME
 
+    final_df = final_df.replace({np.nan: None})
+
     # 主数据写入
     db = DBHelper()
 
@@ -143,6 +156,8 @@ def process_country_trade(current_file_path, year, month):
         yoy_cols = ['yoy_import_export', 'yoy_import', 'yoy_export']
         january_df[yoy_cols] = 0.0  # 模拟数据无同比
 
+        january_df = january_df.replace({np.nan: None})
+
         # 模拟数据写入(增加注释说明)
         db.bulk_insert(
             january_df,
@@ -207,10 +222,6 @@ def read_with_header4(file_path, month):
         })
         # 阶段2:安全类型转换
         .apply(pd.to_numeric, errors='coerce', downcast='float')
-        # 阶段3:空值处理
-        .fillna(0)
-        # 阶段4:精度控制
-        .round(2)
     )
 
     return final_df
@@ -242,19 +253,19 @@ def read_trade_pair(import_path, export_path):
         commodity_name=df["commodity_name"].apply(clean_commodity_name)
     ))
 
-    merged = pd.merge(df_import, df_export, on="commodity_name", how="outer").fillna(0)
+    merged = pd.merge(df_import, df_export, on="commodity_name", how="outer")
     merged["monthly_import"] = merged["monthly_import"].apply(parse_value)
     merged["monthly_export"] = merged["monthly_export"].apply(parse_value)
     return merged
 
 
-def calculate_monthly_values(current_data, prev_data):
-    """"""
-    merged = pd.merge(current_data, prev_data, on="commodity_name",
-                      how="left", suffixes=("_current", "_prev")).fillna(0)
-    merged["monthly_import"] = merged["monthly_import_current"] - merged["monthly_import_prev"]
-    merged["monthly_export"] = merged["monthly_export_current"] - merged["monthly_export_prev"]
-    return merged[["commodity_name", "monthly_import", "monthly_export"]]
+# def calculate_monthly_values(current_data, prev_data):
+#     """"""
+#     merged = pd.merge(current_data, prev_data, on="commodity_name",
+#                       how="left", suffixes=("_current", "_prev"))
+#     merged["monthly_import"] = merged["monthly_import_current"] - merged["monthly_import_prev"]
+#     merged["monthly_export"] = merged["monthly_export_current"] - merged["monthly_export_prev"]
+#     return merged[["commodity_name", "monthly_import", "monthly_export"]]
 
 
 # def clean_commodity_name(name):

+ 32 - 29
crossborder/quanguo/data_cleaning_to_db.py

@@ -3,13 +3,15 @@ import re
 from datetime import datetime
 from pathlib import Path
 
+from crossborder.utils import base_mysql
+
 from crossborder.quanguo.parse_commodity_country_detail_excel import parse_commodity_country_detail
 from crossborder.quanguo.parse_commodity_table_excel import parse_commodity_table_excel
 from crossborder.quanguo.parse_country_table_excel import parse_country_table_excel
 from crossborder.quanguo.parse_month_excel import parse_month_table_excel
 from crossborder.quanguo.parse_region_table_excel import parse_region_table_excel
 from crossborder.quanguo.parse_year_excel import parse_year_table_excel
-from crossborder.utils import base_mysql
+from crossborder.utils.base_mysql import provinces
 from crossborder.utils.constants import DOWNLOAD_DIR
 from crossborder.utils.log import log
 
@@ -128,45 +130,46 @@ def main():
                     if not file.endswith(('.xls', '.xlsx')):
                         continue
 
-                    # if '(1)' in file and '年度表' in file:
-                    #     log.info(f"处理年度汇总表: {file}")
-                    #     parse_year_table_excel(full_path)
-                    #
-                    # elif '(1)' in file and '月度表' in file:
-                    #     log.info(f"处理月度汇总表: {file}")
-                    #     parse_month_table_excel(full_path)
-                    #
-                    # elif '(2)' in file:
-                    #     log.info(f"处理国别(地区)贸易表: {file}")
-                    #     parse_country_table_excel(full_path)
-                    #
-                    # elif '(4)' in file:
-                    #     log.info(f"处理类章贸易表: {file}")
-                    #     parse_commodity_table_excel(full_path)
+                    if '(1)' in file and '年度表' in file:
+                        log.info(f"处理年度汇总表: {file}")
+                        parse_year_table_excel(full_path)
+
+                    elif '(1)' in file and '月度表' in file:
+                        log.info(f"处理月度汇总表: {file}")
+                        parse_month_table_excel(full_path)
+
+                    elif '(2)' in file:
+                        log.info(f"处理国别(地区)贸易表: {file}")
+                        parse_country_table_excel(full_path)
+
+                    elif '(4)' in file:
+                        log.info(f"处理类章贸易表: {file}")
+                        parse_commodity_table_excel(full_path)
 
                     elif '(8)' in file:
                         log.info(f"处理收发货人所在地表: {file}")
                         parse_region_table_excel(full_path)
 
-                    # elif '(15)' in file:
-                    #     log.info(f"处理对部分国家(地区)出口类章金额表: {full_path}")
-                    #     parse_commodity_country_detail(full_path, "export")
-                    #
-                    # elif '(16)' in file:
-                    #     log.info(f"处理自部分国家(地区)进口类章金额表: {full_path}")
-                    #     parse_commodity_country_detail(full_path, "import")
-                    #
-                    # else:
-                    #     log.warning(f"未知类型文件,跳过: {full_path}")
+                    elif '(15)' in file:
+                        log.info(f"处理对部分国家(地区)出口类章金额表: {full_path}")
+                        parse_commodity_country_detail(full_path, "export")
+
+                    elif '(16)' in file:
+                        log.info(f"处理自部分国家(地区)进口类章金额表: {full_path}")
+                        parse_commodity_country_detail(full_path, "import")
+
+                    else:
+                        log.warning(f"未知类型文件,跳过: {full_path}")
 
             log.info(f"{year} 年的数据处理完成!")
 
         except Exception as e:
             log.error(f"{year} 年数据处理失败: {str(e)}")
         finally:
-            log.info("更新省市同比数据!")
-            base_mysql.update_shandong_yoy("河南省")
-            base_mysql.update_shandong_yoy_origin("山东省")
+            log.info("更新海关总署省份同比数据!")
+            for province in provinces:
+                base_mysql.update_shandong_yoy_origin(province)
+            log.info("数据更新完成!")
 
 
 if __name__ == "__main__":

+ 1 - 1
crossborder/quanguo/selenium_download.py

@@ -142,7 +142,7 @@ def crawl_with_selenium(driver, year, latest_only=False):
 
             driver.execute_script("arguments[0].remove()", row)
             WebDriverWait(driver, 10).until(EC.staleness_of(row))
-            time.sleep(random.uniform(1, 3))
+            time.sleep(random.uniform(0.5, 1.5))
 
     except StaleElementReferenceException:
         log.error("检测到元素失效,自动刷新表格")

+ 0 - 45
crossborder/utils/db_helper.py

@@ -92,51 +92,6 @@ class DBHelper:
             log.error(f"数据库操作失败: {str(e)}")
             raise
 
-    def update_january_yoy(self, prov_name='福建省'):
-        """
-        更新指定省份1月份同比数据
-        :param prov_name: 省份名称,默认为福建省
-        """
-        update_sql = text("""
-                          UPDATE t_yujin_crossborder_prov_region_trade AS curr
-                              INNER JOIN t_yujin_crossborder_prov_region_trade AS prev
-                          ON curr.city_code = prev.city_code
-                              AND prev.crossborder_year_month = DATE_FORMAT(
-                              DATE_SUB(
-                              STR_TO_DATE(CONCAT(curr.crossborder_year_month, '-01'), '%Y-%m-%d'),
-                              INTERVAL 1 YEAR
-                              ),
-                              '%Y-01'
-                              )
-                              SET
-                                curr.yoy_import_export = COALESCE (
-                                    TRUNCATE((curr.monthly_total - prev.monthly_total) / NULLIF (prev.monthly_total, 0) * 100, 4),
-                                    0.0000
-                                ),
-                                curr.yoy_import = COALESCE (
-                                    TRUNCATE((curr.monthly_import - prev.monthly_import) / NULLIF (prev.monthly_import, 0) * 100, 4),
-                                    0.0000
-                                ),
-                                curr.yoy_export = COALESCE (
-                                    TRUNCATE((curr.monthly_export - prev.monthly_export) / NULLIF (prev.monthly_export, 0) * 100, 4),
-                                    0.0000
-                                )
-                          WHERE
-                              curr.prov_name = :prov_name
-                            AND curr.crossborder_year_month LIKE '%-01'
-                            AND curr.crossborder_year_month
-                              > '2023-01'
-                          """)
-
-        try:
-            with self.engine.begin() as conn:
-                result = conn.execute(update_sql, {'prov_name': prov_name})
-                log.info(f"Updated {result.rowcount} rows for {prov_name}")
-                return result.rowcount
-
-        except Exception as e:
-            log.error(f"Update failed: {str(e)}")
-            raise RuntimeError(f"同比数据更新失败: {str(e)}") from e
 
     def update_prov_yoy(self, prov_name):
         """