CountryTrade.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. import re
  2. import pandas as pd
  3. from crossborder.utils.parse_utils import clean_county_name
  4. # 配置参数
  5. EXCEL_PATH = r"D:/Downloads/2025051809262394128.xls"
  6. OUTPUT_SQL = "../downloads/country_trade04.sql"
  7. EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
  8. "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",
  9. "区域全面经济伙伴关系协定(RCEP)成员国", "共建“一带一路”国家和地区"]
  10. COUNTRY_CODE_MAPPING = {
  11. # ================= 亚洲 =================
  12. "阿富汗": "AF",
  13. "巴林": "BH",
  14. "孟加拉国": "BD",
  15. "不丹": "BT",
  16. "文莱": "BN",
  17. "缅甸": "MM",
  18. "柬埔寨": "KH",
  19. "塞浦路斯": "CY",
  20. "朝鲜": "KP",
  21. "中国香港": "HK",
  22. "印度": "IN",
  23. "印度尼西亚": "ID",
  24. "伊朗": "IR",
  25. "伊拉克": "IQ",
  26. "以色列": "IL",
  27. "日本": "JP",
  28. "约旦": "JO",
  29. "科威特": "KW",
  30. "老挝": "LA",
  31. "黎巴嫩": "LB",
  32. "中国澳门": "MO",
  33. "马来西亚": "MY",
  34. "马尔代夫": "MV",
  35. "蒙古": "MN",
  36. "尼泊尔": "NP",
  37. "阿曼": "OM",
  38. "巴基斯坦": "PK",
  39. "巴勒斯坦": "PS",
  40. "菲律宾": "PH",
  41. "卡塔尔": "QA",
  42. "沙特阿拉伯": "SA",
  43. "新加坡": "SG",
  44. "韩国": "KR",
  45. "斯里兰卡": "LK",
  46. "叙利亚": "SY",
  47. "泰国": "TH",
  48. "土耳其": "TR",
  49. "阿联酋": "AE",
  50. "也门": "YE",
  51. "越南": "VN",
  52. "中国": "CN",
  53. "中国台湾": "TW",
  54. "哈萨克斯坦": "KZ",
  55. "吉尔吉斯斯坦": "KG",
  56. "塔吉克斯坦": "TJ",
  57. "土库曼斯坦": "TM",
  58. "乌兹别克斯坦": "UZ",
  59. "格鲁吉亚": "GE",
  60. "亚美尼亚": "AM",
  61. "阿塞拜疆": "AZ",
  62. # ================= 非洲 =================
  63. "阿尔及利亚": "DZ",
  64. "安哥拉": "AO",
  65. "贝宁": "BJ",
  66. "博茨瓦纳": "BW",
  67. "布隆迪": "BI",
  68. "喀麦隆": "CM",
  69. "佛得角": "CV",
  70. "中非": "CF",
  71. "乍得": "TD",
  72. "科摩罗": "KM",
  73. "刚果共和国": "CG",
  74. "吉布提": "DJ",
  75. "埃及": "EG",
  76. "赤道几内亚": "GQ",
  77. "埃塞俄比亚": "ET",
  78. "加蓬": "GA",
  79. "冈比亚": "GM",
  80. "加纳": "GH",
  81. "几内亚": "GN",
  82. "几内亚比绍": "GW",
  83. "科特迪瓦": "CI",
  84. "肯尼亚": "KE",
  85. "莱索托": "LS",
  86. "利比里亚": "LR",
  87. "利比亚": "LY",
  88. "马达加斯加": "MG",
  89. "马拉维": "MW",
  90. "马里": "ML",
  91. "毛里塔尼亚": "MR",
  92. "毛里求斯": "MU",
  93. "摩洛哥": "MA",
  94. "莫桑比克": "MZ",
  95. "纳米比亚": "NA",
  96. "尼日尔": "NE",
  97. "尼日利亚": "NG",
  98. "卢旺达": "RW",
  99. "圣多美和普林西比": "ST",
  100. "塞内加尔": "SN",
  101. "塞舌尔": "SC",
  102. "塞拉利昂": "SL",
  103. "索马里": "SO",
  104. "南非": "ZA",
  105. "苏丹": "SD",
  106. "坦桑尼亚": "TZ",
  107. "多哥": "TG",
  108. "突尼斯": "TN",
  109. "乌干达": "UG",
  110. "布基纳法索": "BF",
  111. "刚果民主共和国": "CD",
  112. "赞比亚": "ZM",
  113. "津巴布韦": "ZW",
  114. "厄立特里亚": "ER",
  115. "南苏丹": "SS",
  116. # ================= 欧洲 =================
  117. "比利时": "BE",
  118. "丹麦": "DK",
  119. "英国": "GB",
  120. "德国": "DE",
  121. "法国": "FR",
  122. "爱尔兰": "IE",
  123. "意大利": "IT",
  124. "卢森堡": "LU",
  125. "荷兰": "NL",
  126. "希腊": "GR",
  127. "葡萄牙": "PT",
  128. "西班牙": "ES",
  129. "阿尔巴尼亚": "AL",
  130. "奥地利": "AT",
  131. "保加利亚": "BG",
  132. "芬兰": "FI",
  133. "匈牙利": "HU",
  134. "冰岛": "IS",
  135. "列支敦士登": "LI",
  136. "马耳他": "MT",
  137. "挪威": "NO",
  138. "波兰": "PL",
  139. "罗马尼亚": "RO",
  140. "瑞典": "SE",
  141. "瑞士": "CH",
  142. "爱沙尼亚": "EE",
  143. "拉脱维亚": "LV",
  144. "立陶宛": "LT",
  145. "白俄罗斯": "BY",
  146. "摩尔多瓦": "MD",
  147. "俄罗斯": "RU",
  148. "乌克兰": "UA",
  149. "斯洛文尼亚": "SI",
  150. "克罗地亚": "HR",
  151. "捷克": "CZ",
  152. "斯洛伐克": "SK",
  153. "北马其顿": "MK",
  154. "波斯尼亚和黑塞哥维那": "BA",
  155. "梵蒂冈": "VA",
  156. "塞尔维亚": "RS",
  157. "黑山": "ME",
  158. # ================= 美洲 =================
  159. "安提瓜和巴布达": "AG",
  160. "阿根廷": "AR",
  161. "巴哈马": "BS",
  162. "巴巴多斯": "BB",
  163. "伯利兹": "BZ",
  164. "玻利维亚": "BO",
  165. "巴西": "BR",
  166. "加拿大": "CA",
  167. "智利": "CL",
  168. "哥伦比亚": "CO",
  169. "哥斯达黎加": "CR",
  170. "古巴": "CU",
  171. "多米尼克": "DM",
  172. "多米尼加": "DO",
  173. "厄瓜多尔": "EC",
  174. "萨尔瓦多": "SV",
  175. "格林纳达": "GD",
  176. "危地马拉": "GT",
  177. "圭亚那": "GY",
  178. "海地": "HT",
  179. "洪都拉斯": "HN",
  180. "牙买加": "JM",
  181. "墨西哥": "MX",
  182. "尼加拉瓜": "NI",
  183. "巴拿马": "PA",
  184. "巴拉圭": "PY",
  185. "秘鲁": "PE",
  186. "圣卢西亚": "LC",
  187. "圣文森特和格林纳丁斯": "VC",
  188. "苏里南": "SR",
  189. "特立尼达和多巴哥": "TT",
  190. "美国": "US",
  191. "乌拉圭": "UY",
  192. "委内瑞拉": "VE",
  193. "圣基茨和尼维斯": "KN",
  194. # ================= 大洋洲 =================
  195. "澳大利亚": "AU",
  196. "斐济": "FJ",
  197. "基里巴斯": "KI",
  198. "马绍尔群岛": "MH",
  199. "密克罗尼西亚联邦": "FM",
  200. "瑙鲁": "NR",
  201. "新西兰": "NZ",
  202. "帕劳": "PW",
  203. "巴布亚新几内亚": "PG",
  204. "萨摩亚": "WS",
  205. "所罗门群岛": "SB",
  206. "汤加": "TO",
  207. "图瓦卢": "TV",
  208. "瓦努阿图": "VU",
  209. # ================= 特殊地区 =================
  210. "法属圭亚那": "GF",
  211. "瓜德罗普": "GP",
  212. "留尼汪": "RE",
  213. "圣马丁": "MF",
  214. "荷属圣马丁": "SX",
  215. "法属波利尼西亚": "PF",
  216. "新喀里多尼亚": "NC",
  217. "库克群岛": "CK",
  218. "关岛": "GU",
  219. "波多黎各": "PR",
  220. "美属萨摩亚": "AS",
  221. "百慕大": "BM",
  222. "开曼群岛": "KY",
  223. "福克兰群岛(马尔维纳斯)": "FK",
  224. "格陵兰": "GL",
  225. "法属南方领地": "TF",
  226. "赫德岛和麦克唐纳岛": "HM",
  227. "托克劳": "TK",
  228. "纽埃": "NU",
  229. "诺福克岛": "NF",
  230. "北马里亚纳群岛": "MP",
  231. "皮特凯恩": "PN",
  232. "圣赫勒拿": "SH",
  233. "斯瓦尔巴群岛和扬马延岛": "SJ",
  234. "东帝汶": "TL",
  235. # ==== 欧洲特殊地区 ====
  236. "加那利群岛": "IC", # 西班牙特殊领土代码
  237. "塞卜泰(休达)": "XC", # 休达官方代码
  238. "梅利利亚": "XL", # 梅利利亚官方代码
  239. "安道尔": "AD",
  240. "直布罗陀": "GI",
  241. "摩纳哥": "MC",
  242. "圣马力诺": "SM",
  243. "法罗群岛": "FO", # 丹麦自治领
  244. "奥兰群岛": "AX", # 芬兰自治省
  245. "格恩西": "GG", # 英国皇家属地
  246. "马恩岛": "IM",
  247. "泽西": "JE",
  248. # ==== 非洲特殊地区 ====
  249. "西撒哈拉": "EH", # 争议地区代码
  250. "斯威士兰": "SZ", # 正式国名为"Eswatini"但保留旧映射
  251. "马约特": "YT", # 法国海外省
  252. # ==== 美洲特殊地区 ====
  253. "英属印度洋领地": "IO",
  254. "阿鲁巴": "AW",
  255. "库拉索": "CW",
  256. "马提尼克": "MQ", # 法国海外省
  257. "蒙特塞拉特": "MS",
  258. "法属圣马丁": "MF",
  259. "特克斯和凯科斯群岛": "TC",
  260. "英属维尔京群岛": "VG",
  261. "博纳尔,圣俄斯塔休斯和萨巴": "BQ",
  262. "圣巴泰勒米": "BL", # 法国海外集体
  263. "美属维尔京群岛": "VI",
  264. "安圭拉": "AI",
  265. "圣皮埃尔和密克隆": "PM",
  266. # ==== 大洋洲特殊地区 ====
  267. "瓦利斯和富图纳": "WF",
  268. "科科斯(基林)群岛": "CC",
  269. "圣诞岛": "CX",
  270. "美国本土外小岛屿": "UM",
  271. # ==== 特殊标记 ====
  272. "布维岛": "BV", # 挪威属地
  273. "南乔治亚岛和南桑德韦奇岛": "GS",
  274. "国家(地区)不明": "XX" # 自定义代码
  275. }
  276. # 在代码中添加例外处理
  277. def get_country_code(chinese_name):
  278. """带异常处理的国家编码查询"""
  279. code = COUNTRY_CODE_MAPPING.get(chinese_name.strip(), None)
  280. # 特殊处理逻辑
  281. if not code:
  282. if chinese_name.endswith("(地区)"):
  283. return "N/A" # 标记为地区
  284. elif "国家联盟" in chinese_name:
  285. return "ORG" # 标记为国际组织
  286. return code
  287. def parse_excel_to_sql():
  288. """主处理函数"""
  289. # 读取Excel文件(注意需要xlrd 1.2.0版本)
  290. try:
  291. df = pd.read_excel(EXCEL_PATH, engine='xlrd', header=None, dtype=str)
  292. except ImportError:
  293. raise ImportError("需要xlrd库支持.xls文件,请安装:pip install xlrd==1.2.0")
  294. except Exception as e:
  295. raise ValueError(f"读取Excel文件失败:{str(e)}")
  296. # 解析年份月份
  297. year_month = None
  298. data_start = None
  299. for idx, row in df.iterrows():
  300. cell_value = str(row[1])
  301. if "进出口商品国别" in cell_value:
  302. match = re.search(r"(\d{4})年(\d{1,2})月", cell_value)
  303. if match:
  304. year = match.group(1)
  305. month = match.group(2).zfill(2)
  306. year_month = f"{year}-{month}"
  307. # 调整起始行计算公式
  308. data_start = idx + 6 # 原为+3,改为+4
  309. break
  310. if not year_month or data_start is None:
  311. raise ValueError("无法解析年份月份信息,请检查Excel文件格式")
  312. # 读取数据区域
  313. try:
  314. df_data = pd.read_excel(EXCEL_PATH, engine='xlrd', skiprows=data_start, header=None)
  315. except Exception as e:
  316. raise ValueError(f"读取数据区域失败:{str(e)}")
  317. # 列映射配置(根据实际Excel结构调整)
  318. COL_MAPPING = {
  319. "country_name": 1,
  320. "monthly_total": 2,
  321. "monthly_export": 4,
  322. "monthly_import": 6,
  323. "ytd_total": 3,
  324. "ytd_export": 5,
  325. "ytd_import": 7,
  326. "yoy_total": 8,
  327. "yoy_export": 9,
  328. "yoy_import": 10
  329. }
  330. # 生成SQL语句
  331. sql_statements = []
  332. for index, row in df_data.iterrows():
  333. try:
  334. # 获取国家名称
  335. country_name = str(row[COL_MAPPING["country_name"]]).strip()
  336. country_name = clean_county_name(country_name)
  337. # 跳过空行和区域数据
  338. if not country_name or country_name in EXCLUDE_REGIONS:
  339. continue
  340. # 获取国家代码
  341. country_code = get_country_code(country_name)
  342. # 新增:排除地区和组织代码
  343. if country_code in ['N/A', 'ORG', 'XX'] or not country_code:
  344. print(f"跳过地区/组织:{country_name}")
  345. continue
  346. if not country_code:
  347. print(f"警告:第{index + data_start + 1}行 未找到国家代码 [{country_name}]")
  348. continue
  349. # 数值处理函数(万元转元)
  350. # 修改后的数值处理函数
  351. def convert_value(val):
  352. """增强版数值转换,处理所有异常情况"""
  353. try:
  354. # 处理特殊字符
  355. cleaned = re.sub(r'[^0-9\.\-\+]', '', str(val))
  356. # 处理空值和纯符号
  357. if cleaned in ('', '-', '+'):
  358. return 0.0
  359. # 处理科学计数法
  360. if 'e' in cleaned.lower():
  361. return float(cleaned)
  362. # 验证数值有效性
  363. parts = cleaned.split('.')
  364. if len(parts) > 2: # 多个小数点
  365. return 0.0
  366. if '-' in cleaned and not cleaned.startswith('-'): # 负号位置错误
  367. return 0.0
  368. return float(cleaned)
  369. except:
  370. return 0.0
  371. def safe_convert(val):
  372. """安全转换函数,处理特殊字符和空值"""
  373. # 处理空值和特殊占位符
  374. if pd.isna(val) or str(val).strip() in ('', '-', 'NA'):
  375. return 'null' # 返回Python的None,对应SQL的NULL
  376. # 清理非数字字符
  377. cleaned = re.sub(r'[^0-9\.\-]', '', str(val))
  378. try:
  379. return round(float(cleaned), 2)
  380. except:
  381. return 'null'
  382. # 数据转换
  383. values = {
  384. "year_month": year_month,
  385. "country_code": country_code,
  386. "country_name": country_name.replace("'", "''"), # 处理单引号
  387. "monthly_total": convert_value(row[COL_MAPPING["monthly_total"]]),
  388. "monthly_export": convert_value(row[COL_MAPPING["monthly_export"]]),
  389. "monthly_import": convert_value(row[COL_MAPPING["monthly_import"]]),
  390. "ytd_total": convert_value(row[COL_MAPPING["ytd_total"]]),
  391. "ytd_export": convert_value(row[COL_MAPPING["ytd_export"]]),
  392. "ytd_import": convert_value(row[COL_MAPPING["ytd_import"]]),
  393. "yoy_total": safe_convert(row[COL_MAPPING["yoy_total"]]),
  394. "yoy_export": safe_convert(row[COL_MAPPING["yoy_export"]]),
  395. "yoy_import": safe_convert(row[COL_MAPPING["yoy_import"]])
  396. }
  397. # 构建SQL语句
  398. sql = f"""INSERT INTO t_yujin_crossborder_country_trade (
  399. `year_month`, country_code, country_name,
  400. monthly_total, monthly_import, monthly_export,
  401. ytd_total, ytd_import, ytd_export,
  402. yoy_import_export, yoy_import, yoy_export
  403. ) VALUES (
  404. '{values["year_month"]}',
  405. '{values["country_code"]}',
  406. '{values["country_name"]}',
  407. {values["monthly_total"]},
  408. {values["monthly_import"]},
  409. {values["monthly_export"]},
  410. {values["ytd_total"]},
  411. {values["ytd_import"]},
  412. {values["ytd_export"]},
  413. {values["yoy_total"]},
  414. {values["yoy_export"]},
  415. {values["yoy_import"]}
  416. );"""
  417. sql_statements.append(sql)
  418. except Exception as e:
  419. print(f"第{index + data_start + 1}行处理失败:{str(e)}")
  420. continue
  421. # 写入SQL文件
  422. try:
  423. with open(OUTPUT_SQL, "w", encoding="utf-8") as f:
  424. f.write("\n".join(sql_statements))
  425. print(f"成功生成 {len(sql_statements)} 条SQL语句,保存至:{OUTPUT_SQL}")
  426. except Exception as e:
  427. print(f"文件写入失败:{str(e)}")
  428. if __name__ == "__main__":
  429. try:
  430. parse_excel_to_sql()
  431. except Exception as e:
  432. print(f"程序执行失败:{str(e)}")