import re import pandas as pd from utils.parse_utils import clean_county_name # 配置参数 EXCEL_PATH = r"D:/Downloads/2025051809262394128.xls" OUTPUT_SQL = "../downloads/country_trade04.sql" EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲", "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织", "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区"] COUNTRY_CODE_MAPPING = { # ================= 亚洲 ================= "阿富汗": "AF", "巴林": "BH", "孟加拉国": "BD", "不丹": "BT", "文莱": "BN", "缅甸": "MM", "柬埔寨": "KH", "塞浦路斯": "CY", "朝鲜": "KP", "中国香港": "HK", "印度": "IN", "印度尼西亚": "ID", "伊朗": "IR", "伊拉克": "IQ", "以色列": "IL", "日本": "JP", "约旦": "JO", "科威特": "KW", "老挝": "LA", "黎巴嫩": "LB", "中国澳门": "MO", "马来西亚": "MY", "马尔代夫": "MV", "蒙古": "MN", "尼泊尔": "NP", "阿曼": "OM", "巴基斯坦": "PK", "巴勒斯坦": "PS", "菲律宾": "PH", "卡塔尔": "QA", "沙特阿拉伯": "SA", "新加坡": "SG", "韩国": "KR", "斯里兰卡": "LK", "叙利亚": "SY", "泰国": "TH", "土耳其": "TR", "阿联酋": "AE", "也门": "YE", "越南": "VN", "中国": "CN", "中国台湾": "TW", "哈萨克斯坦": "KZ", "吉尔吉斯斯坦": "KG", "塔吉克斯坦": "TJ", "土库曼斯坦": "TM", "乌兹别克斯坦": "UZ", "格鲁吉亚": "GE", "亚美尼亚": "AM", "阿塞拜疆": "AZ", # ================= 非洲 ================= "阿尔及利亚": "DZ", "安哥拉": "AO", "贝宁": "BJ", "博茨瓦纳": "BW", "布隆迪": "BI", "喀麦隆": "CM", "佛得角": "CV", "中非": "CF", "乍得": "TD", "科摩罗": "KM", "刚果共和国": "CG", "吉布提": "DJ", "埃及": "EG", "赤道几内亚": "GQ", "埃塞俄比亚": "ET", "加蓬": "GA", "冈比亚": "GM", "加纳": "GH", "几内亚": "GN", "几内亚比绍": "GW", "科特迪瓦": "CI", "肯尼亚": "KE", "莱索托": "LS", "利比里亚": "LR", "利比亚": "LY", "马达加斯加": "MG", "马拉维": "MW", "马里": "ML", "毛里塔尼亚": "MR", "毛里求斯": "MU", "摩洛哥": "MA", "莫桑比克": "MZ", "纳米比亚": "NA", "尼日尔": "NE", "尼日利亚": "NG", "卢旺达": "RW", "圣多美和普林西比": "ST", "塞内加尔": "SN", "塞舌尔": "SC", "塞拉利昂": "SL", "索马里": "SO", "南非": "ZA", "苏丹": "SD", "坦桑尼亚": "TZ", "多哥": "TG", "突尼斯": "TN", "乌干达": "UG", "布基纳法索": "BF", "刚果民主共和国": "CD", "赞比亚": "ZM", "津巴布韦": "ZW", "厄立特里亚": "ER", "南苏丹": "SS", # ================= 欧洲 ================= "比利时": "BE", "丹麦": "DK", "英国": "GB", "德国": "DE", "法国": "FR", "爱尔兰": "IE", "意大利": "IT", "卢森堡": "LU", "荷兰": "NL", "希腊": "GR", "葡萄牙": "PT", "西班牙": "ES", "阿尔巴尼亚": "AL", "奥地利": "AT", "保加利亚": "BG", "芬兰": "FI", "匈牙利": "HU", "冰岛": "IS", "列支敦士登": "LI", "马耳他": "MT", "挪威": "NO", "波兰": "PL", "罗马尼亚": "RO", "瑞典": "SE", "瑞士": "CH", "爱沙尼亚": "EE", "拉脱维亚": "LV", "立陶宛": "LT", "白俄罗斯": "BY", "摩尔多瓦": "MD", "俄罗斯": "RU", "乌克兰": "UA", "斯洛文尼亚": "SI", "克罗地亚": "HR", "捷克": "CZ", "斯洛伐克": "SK", "北马其顿": "MK", "波斯尼亚和黑塞哥维那": "BA", "梵蒂冈": "VA", "塞尔维亚": "RS", "黑山": "ME", # ================= 美洲 ================= "安提瓜和巴布达": "AG", "阿根廷": "AR", "巴哈马": "BS", "巴巴多斯": "BB", "伯利兹": "BZ", "玻利维亚": "BO", "巴西": "BR", "加拿大": "CA", "智利": "CL", "哥伦比亚": "CO", "哥斯达黎加": "CR", "古巴": "CU", "多米尼克": "DM", "多米尼加": "DO", "厄瓜多尔": "EC", "萨尔瓦多": "SV", "格林纳达": "GD", "危地马拉": "GT", "圭亚那": "GY", "海地": "HT", "洪都拉斯": "HN", "牙买加": "JM", "墨西哥": "MX", "尼加拉瓜": "NI", "巴拿马": "PA", "巴拉圭": "PY", "秘鲁": "PE", "圣卢西亚": "LC", "圣文森特和格林纳丁斯": "VC", "苏里南": "SR", "特立尼达和多巴哥": "TT", "美国": "US", "乌拉圭": "UY", "委内瑞拉": "VE", "圣基茨和尼维斯": "KN", # ================= 大洋洲 ================= "澳大利亚": "AU", "斐济": "FJ", "基里巴斯": "KI", "马绍尔群岛": "MH", "密克罗尼西亚联邦": "FM", "瑙鲁": "NR", "新西兰": "NZ", "帕劳": "PW", "巴布亚新几内亚": "PG", "萨摩亚": "WS", "所罗门群岛": "SB", "汤加": "TO", "图瓦卢": "TV", "瓦努阿图": "VU", # ================= 特殊地区 ================= "法属圭亚那": "GF", "瓜德罗普": "GP", "留尼汪": "RE", "圣马丁": "MF", "荷属圣马丁": "SX", "法属波利尼西亚": "PF", "新喀里多尼亚": "NC", "库克群岛": "CK", "关岛": "GU", "波多黎各": "PR", "美属萨摩亚": "AS", "百慕大": "BM", "开曼群岛": "KY", "福克兰群岛(马尔维纳斯)": "FK", "格陵兰": "GL", "法属南方领地": "TF", "赫德岛和麦克唐纳岛": "HM", "托克劳": "TK", "纽埃": "NU", "诺福克岛": "NF", "北马里亚纳群岛": "MP", "皮特凯恩": "PN", "圣赫勒拿": "SH", "斯瓦尔巴群岛和扬马延岛": "SJ", "东帝汶": "TL", # ==== 欧洲特殊地区 ==== "加那利群岛": "IC", # 西班牙特殊领土代码 "塞卜泰(休达)": "XC", # 休达官方代码 "梅利利亚": "XL", # 梅利利亚官方代码 "安道尔": "AD", "直布罗陀": "GI", "摩纳哥": "MC", "圣马力诺": "SM", "法罗群岛": "FO", # 丹麦自治领 "奥兰群岛": "AX", # 芬兰自治省 "格恩西": "GG", # 英国皇家属地 "马恩岛": "IM", "泽西": "JE", # ==== 非洲特殊地区 ==== "西撒哈拉": "EH", # 争议地区代码 "斯威士兰": "SZ", # 正式国名为"Eswatini"但保留旧映射 "马约特": "YT", # 法国海外省 # ==== 美洲特殊地区 ==== "英属印度洋领地": "IO", "阿鲁巴": "AW", "库拉索": "CW", "马提尼克": "MQ", # 法国海外省 "蒙特塞拉特": "MS", "法属圣马丁": "MF", "特克斯和凯科斯群岛": "TC", "英属维尔京群岛": "VG", "博纳尔,圣俄斯塔休斯和萨巴": "BQ", "圣巴泰勒米": "BL", # 法国海外集体 "美属维尔京群岛": "VI", "安圭拉": "AI", "圣皮埃尔和密克隆": "PM", # ==== 大洋洲特殊地区 ==== "瓦利斯和富图纳": "WF", "科科斯(基林)群岛": "CC", "圣诞岛": "CX", "美国本土外小岛屿": "UM", # ==== 特殊标记 ==== "布维岛": "BV", # 挪威属地 "南乔治亚岛和南桑德韦奇岛": "GS", "国家(地区)不明": "XX" # 自定义代码 } # 在代码中添加例外处理 def get_country_code(chinese_name): """带异常处理的国家编码查询""" code = COUNTRY_CODE_MAPPING.get(chinese_name.strip(), None) # 特殊处理逻辑 if not code: if chinese_name.endswith("(地区)"): return "N/A" # 标记为地区 elif "国家联盟" in chinese_name: return "ORG" # 标记为国际组织 return code def parse_excel_to_sql(): """主处理函数""" # 读取Excel文件(注意需要xlrd 1.2.0版本) try: df = pd.read_excel(EXCEL_PATH, engine='xlrd', header=None, dtype=str) except ImportError: raise ImportError("需要xlrd库支持.xls文件,请安装:pip install xlrd==1.2.0") except Exception as e: raise ValueError(f"读取Excel文件失败:{str(e)}") # 解析年份月份 year_month = None data_start = None for idx, row in df.iterrows(): cell_value = str(row[1]) if "进出口商品国别" in cell_value: match = re.search(r"(\d{4})年(\d{1,2})月", cell_value) if match: year = match.group(1) month = match.group(2).zfill(2) year_month = f"{year}-{month}" # 调整起始行计算公式 data_start = idx + 6 # 原为+3,改为+4 break if not year_month or data_start is None: raise ValueError("无法解析年份月份信息,请检查Excel文件格式") # 读取数据区域 try: df_data = pd.read_excel(EXCEL_PATH, engine='xlrd', skiprows=data_start, header=None) except Exception as e: raise ValueError(f"读取数据区域失败:{str(e)}") # 列映射配置(根据实际Excel结构调整) COL_MAPPING = { "country_name": 1, "monthly_total": 2, "monthly_export": 4, "monthly_import": 6, "ytd_total": 3, "ytd_export": 5, "ytd_import": 7, "yoy_total": 8, "yoy_export": 9, "yoy_import": 10 } # 生成SQL语句 sql_statements = [] for index, row in df_data.iterrows(): try: # 获取国家名称 country_name = str(row[COL_MAPPING["country_name"]]).strip() country_name = clean_county_name(country_name) # 跳过空行和区域数据 if not country_name or country_name in EXCLUDE_REGIONS: continue # 获取国家代码 country_code = get_country_code(country_name) # 新增:排除地区和组织代码 if country_code in ['N/A', 'ORG', 'XX'] or not country_code: print(f"跳过地区/组织:{country_name}") continue if not country_code: print(f"警告:第{index + data_start + 1}行 未找到国家代码 [{country_name}]") continue # 数值处理函数(万元转元) # 修改后的数值处理函数 def convert_value(val): """增强版数值转换,处理所有异常情况""" try: # 处理特殊字符 cleaned = re.sub(r'[^0-9\.\-\+]', '', str(val)) # 处理空值和纯符号 if cleaned in ('', '-', '+'): return 0.0 # 处理科学计数法 if 'e' in cleaned.lower(): return float(cleaned) # 验证数值有效性 parts = cleaned.split('.') if len(parts) > 2: # 多个小数点 return 0.0 if '-' in cleaned and not cleaned.startswith('-'): # 负号位置错误 return 0.0 return float(cleaned) except: return 0.0 def safe_convert(val): """安全转换函数,处理特殊字符和空值""" # 处理空值和特殊占位符 if pd.isna(val) or str(val).strip() in ('', '-', 'NA'): return 'null' # 返回Python的None,对应SQL的NULL # 清理非数字字符 cleaned = re.sub(r'[^0-9\.\-]', '', str(val)) try: return round(float(cleaned), 2) except: return 'null' # 数据转换 values = { "year_month": year_month, "country_code": country_code, "country_name": country_name.replace("'", "''"), # 处理单引号 "monthly_total": convert_value(row[COL_MAPPING["monthly_total"]]), "monthly_export": convert_value(row[COL_MAPPING["monthly_export"]]), "monthly_import": convert_value(row[COL_MAPPING["monthly_import"]]), "ytd_total": convert_value(row[COL_MAPPING["ytd_total"]]), "ytd_export": convert_value(row[COL_MAPPING["ytd_export"]]), "ytd_import": convert_value(row[COL_MAPPING["ytd_import"]]), "yoy_total": safe_convert(row[COL_MAPPING["yoy_total"]]), "yoy_export": safe_convert(row[COL_MAPPING["yoy_export"]]), "yoy_import": safe_convert(row[COL_MAPPING["yoy_import"]]) } # 构建SQL语句 sql = f"""INSERT INTO t_yujin_crossborder_country_trade ( `year_month`, country_code, country_name, monthly_total, monthly_import, monthly_export, ytd_total, ytd_import, ytd_export, yoy_import_export, yoy_import, yoy_export ) VALUES ( '{values["year_month"]}', '{values["country_code"]}', '{values["country_name"]}', {values["monthly_total"]}, {values["monthly_import"]}, {values["monthly_export"]}, {values["ytd_total"]}, {values["ytd_import"]}, {values["ytd_export"]}, {values["yoy_total"]}, {values["yoy_export"]}, {values["yoy_import"]} );""" sql_statements.append(sql) except Exception as e: print(f"第{index + data_start + 1}行处理失败:{str(e)}") continue # 写入SQL文件 try: with open(OUTPUT_SQL, "w", encoding="utf-8") as f: f.write("\n".join(sql_statements)) print(f"成功生成 {len(sql_statements)} 条SQL语句,保存至:{OUTPUT_SQL}") except Exception as e: print(f"文件写入失败:{str(e)}") if __name__ == "__main__": try: parse_excel_to_sql() except Exception as e: print(f"程序执行失败:{str(e)}")