wyp
/
crossborder


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
							# ---------------------------- 核心解析逻辑修改 ----------------------------
from quanguo.CountryTradeYear import COUNTRY_CODE_MAPPING
from utils.parse_utils import clean_county_name


def chinese_class_to_number(class_str):
    """精确匹配中文类名"""
    cn_num_map = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
        '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16,
        '十七': 17, '十八': 18, '十九': 19, '二十': 20, '二十一': 21, '二十二': 22
    }
    match = re.match(r'^第([一二三四五六七八九十百千万]+)类$', class_str)
    return cn_num_map.get(match.group(1), 0) if match else 0


def parse_hscode(row_value):
    """增强版HS编码解析，支持更多格式"""
    global current_class

    # 清洗输入值（新增删除中文括号内容）
    cell_value = str(row_value).strip()
    cell_value = re.sub(r'\s+', ' ', cell_value)  # 统一空格
    cell_value = re.sub(r'[（(].*[）)]', '', cell_value)  # 删除括号内容
    if not cell_value:
        return None

    # 匹配中文类名（如 “第一类 动物”）
    class_cn_match = re.match(r'^第([一二三四五六七八九十百千万]+)类', cell_value)
    if class_cn_match:
        class_num = chinese_class_to_number(f"第{class_cn_match.group(1)}类")
        current_class = f"{class_num:02d}"
        return current_class

    # 匹配数字类名（如 "第5类 矿产品"）
    class_digit_match = re.match(r'^第\s*(\d+)\s*类', cell_value)
    if class_digit_match:
        class_num = class_digit_match.group(1)
        current_class = f"{int(class_num):02d}"
        return current_class

    # 匹配独立章名（如 "01章 活动物" 或 "第一章 工业制成品"）
    chapter_match = re.match(r'^(?:第)?\s*([一二三四五六七八九十百千万]+|\d{1,2})\s*章', cell_value)
    if chapter_match:
        chapter_str = chapter_match.group(1)

        if not chapter_str.isdigit():
            chapter_digit = chinese_class_to_number(chapter_str + "章")
        else:
            chapter_digit = int(chapter_str)

        if current_class:
            return f"{current_class}{chapter_digit:02d}"
        else:
            return f"{chapter_digit:02d}"

    return None


# ---------------------------- 完整代码集成 ----------------------------
import re
import openpyxl
from decimal import Decimal
import pandas as pd


# ---------------------------- 全局配置 ----------------------------
current_class = None
year_month = '2025-04'

def parse_value(value):
    """增强型数值解析，处理空值和特殊符号"""
    try:
        cleaned = str(value).strip().replace(',', '')
        # 处理空值和"-"
        if cleaned in ('', '-', 'NA', 'None'):
            return 0  # 关键修改：将"-"转换为0
        return Decimal(cleaned).quantize(Decimal('0.0000'))
    except Exception:
        return 0  # 所有异常情况返回0

def generate_country_sql(file_path, trade_type):
    """生成国家维度SQL脚本（支持数字类名）"""
    df = pd.read_excel(file_path, engine='xlrd')
    converted_path = f"converted_{trade_type}.xlsx"
    df.to_excel(converted_path, index=False)

    wb = openpyxl.load_workbook(converted_path)
    sheet = wb.active
    sql_scripts = []
    global current_class

    # 遍历行数据（从第7行开始）
    for row_idx, row in enumerate(sheet.iter_rows(min_row=7, values_only=True), start=7):
        if not row[1]:
            continue

        # 解析HS编码（核心修改）
        hs_code = parse_hscode(row[1])
        if not hs_code:
            print(f"[行{row_idx}] 忽略无效行：{row[1]}")
            continue

        # 国家数据处理（保持原逻辑）
        country_data = []
        for col_idx in range(2, len(row), 2):
            # 获取国家名称（假设国家名在表头第4行）
            country_name = sheet.cell(row=4, column=col_idx + 1).value
            if not country_name:
                continue

            country_name = clean_county_name(country_name)
            country_code = COUNTRY_CODE_MAPPING.get(country_name, 'XX')
            if  country_code == 'XX':
                # print(f"[行{row_idx}] 忽略未知国家：{country_name}")
                continue

            # 关键修改：读取国家对应的month和cumulative列
            month_val = parse_value(row[col_idx])
            cumulative_val = parse_value(row[col_idx + 1]) if col_idx + 1 < len(row) else 0

            country_data.append({
                'country_code': country_code,
                'country_name': country_name,
                'month': month_val,
                'cumulative': cumulative_val  # 添加累计值
            })

        # 生成SQL（新增数字类兼容）
        if country_data:
            try:
                for country in country_data:
                    # 使用国家专属的month/cumulative值（关键修改）
                    sql = f"""INSERT INTO `t_yujin_crossborder_commodity_country`
                    (`year_month`,`country_code`,`country_name`,`hs_code`,`trade_type`,
                    `month_amount`,`cumulative_amount`)
                    VALUES (
                        '{year_month}',
                        '{country['country_code']}',
                        '{country['country_name']}',
                        '{hs_code}',
                        '{trade_type}',
                        {country['month']},  # 使用国家维度数据
                        {country['cumulative']}  # 使用国家维度数据
                    ) ON DUPLICATE KEY UPDATE
                        month_amount = VALUES(month_amount),
                        cumulative_amount = VALUES(cumulative_amount);"""
                    sql_scripts.append(sql)
            except Exception as e:
                print(f"[行{row_idx}] 数据异常：{str(e)}")

    return sql_scripts


# 执行入口保持不变
if __name__ == "__main__":
    export_sql = generate_country_sql(r"C:/Users/admin/PyCharmMiscProject/cross/2025051809572148978.xls", "export")
    import_sql = generate_country_sql(r"C:/Users/admin/PyCharmMiscProject/cross/2025051809593876500.xls", "import")

    with open('../downloads/commodity_country04.sql', 'w', encoding='utf-8') as f:
        f.write('\n'.join(export_sql + import_sql))

    print(f"生成完成，出口{len(export_sql)}条，进口{len(import_sql)}条")