Bläddra i källkod

crawl data fix

zhangfan 1 månad sedan
förälder
incheckning
eeb4a70678

+ 4 - 4
crossborder/cli.py

@@ -45,10 +45,10 @@ def main():
     if getattr(sys, 'frozen', False):
         # 打包后的exe路径
         application_path = os.path.dirname(sys.executable)
-    else:
-        # 开发环境路径
-        application_path = os.path.dirname(os.path.abspath(__file__))
-    os.chdir(application_path)
+    # else:
+    #     # 开发环境路径
+    #     application_path = os.path.dirname(os.path.abspath(__file__))
+        os.chdir(application_path)
 
     if len(sys.argv) == 1:
         sys.argv = ['run-crossborder.exe', '--all']

+ 1 - 0
crossborder/hebei/__init__.py

@@ -4,6 +4,7 @@ from pathlib import Path
 script_dir = os.getcwd()
 parent_dir = os.path.dirname(script_dir)
 download_dir = os.path.join(parent_dir, 'hebei')
+# download_dir = os.path.join(script_dir, 'downloads', 'hebei')
 # 创建目录(如果不存在)
 os.makedirs(download_dir, exist_ok=True)
 # 切换当前工作目录到 download_dir

+ 1 - 0
crossborder/jiangsu/__init__.py

@@ -4,6 +4,7 @@ from pathlib import Path
 script_dir = os.getcwd()
 parent_dir = os.path.dirname(script_dir)
 download_dir = os.path.join(parent_dir, 'jiangsu')
+# download_dir = os.path.join(script_dir, 'downloads', 'jiangsu')
 # 创建目录(如果不存在)
 os.makedirs(download_dir, exist_ok=True)
 # 切换当前工作目录到 download_dir

+ 6 - 6
crossborder/jiangsu/gov_commodity_jiangsu_import_export.py

@@ -61,17 +61,17 @@ def process_folder(path, all_records):
 
                 temp_df = df[[1, 6]].rename(columns={1: 'commodity', 6: 'import'})
                 temp_df['import'] = pd.to_numeric(temp_df['import'].replace('--', 0), errors='coerce')
-                temp_df['import'] = temp_df['import'] * 10
+                temp_df['import'] = temp_df['import'] / 10
                 import_df = pd.concat([import_df, temp_df])
 
                 temp_df = df[[1, 4]].rename(columns={1: 'commodity', 4: 'export'})
                 temp_df['export'] = pd.to_numeric(temp_df['export'].replace('--', 0), errors='coerce')
-                temp_df['export'] = temp_df['export'] * 10
+                temp_df['export'] = temp_df['export'] / 10
                 export_df = pd.concat([export_df, temp_df])
 
                 temp_df = df[[1, 2]].rename(columns={1: 'commodity', 2: 'total'})
                 temp_df['total'] = pd.to_numeric(temp_df['total'].replace('--', 0), errors='coerce')
-                temp_df['total'] = temp_df['total'] * 10
+                temp_df['total'] = temp_df['total'] / 10
                 total_df = pd.concat([total_df, temp_df])
                 break
 
@@ -165,8 +165,8 @@ def hierarchical_traversal(root_path, all_records):
 
 if __name__ == '__main__':
     all_records = base_mysql.get_hs_all()
-    # hierarchical_traversal(download_dir, all_records)
+    hierarchical_traversal(download_dir, all_records)
 
-    root = Path(download_dir)/'2023'/'01'
-    process_folder(root, all_records)
+    # root = Path(download_dir)/'2023'/'01'
+    # process_folder(root, all_records)
     print("江苏省海关类章所有文件处理完成!")

+ 3 - 2
crossborder/utils/base_mysql.py

@@ -337,8 +337,8 @@ def _update_shandong_new_yoy_origin(region_name):
         return result.rowcount
 
 if __name__ == '__main__':
-    commodity_code, commodity_name_fix = get_commodity_id('农产品')
-    print(commodity_code, commodity_name_fix)
+    # commodity_code, commodity_name_fix = get_commodity_id('农产品')
+    # print(commodity_code, commodity_name_fix)
     # check_year, check_month = 2024, 4
     # count = get_code_exist(f'{check_year}-{check_month:02d}', "340000")
     # print(count)
@@ -346,6 +346,7 @@ if __name__ == '__main__':
     # 新表更新地级市同比
     # for province in provinces:
     #     update_shandong_yoy(province)
+    update_shandong_yoy('浙江省')
 
     # 旧表更新省份同比
     # for province in provinces:

+ 1 - 0
crossborder/zhejiang/__init__.py

@@ -4,6 +4,7 @@ from pathlib import Path
 script_dir = os.getcwd()
 parent_dir = os.path.dirname(script_dir)
 download_dir = os.path.join(parent_dir, 'zhejiang')
+# download_dir = os.path.join(script_dir, 'downloads', 'zhejiang')
 # 创建目录(如果不存在)
 os.makedirs(download_dir, exist_ok=True)
 # 切换当前工作目录到 download_dir

+ 27 - 18
crossborder/zhejiang/gov_commodity_zhejiang_country.py

@@ -5,7 +5,7 @@ import pandas as pd
 from crossborder.zhejiang import download_dir
 from crossborder.utils import base_country_code, base_mysql
 from crossborder.utils.base_country_code import format_sql_value
-from crossborder.utils.log import  get_logger
+from crossborder.utils.log import get_logger
 
 log = get_logger(__name__)
 
@@ -88,23 +88,33 @@ def process_folder(path):
         prev_export = prev_export_df.groupby('commodity')['export'].sum().reset_index()
         prev_total_df = prev_total_df.groupby('commodity')['total'].sum().reset_index()
 
-        # 差值计算
-        curr_import = pd.merge(curr_import, prev_import, on='commodity', how='left')
-        curr_import['import'] = round(curr_import['import_x'] - curr_import['import_y'], 4)
+        # 新增字段标准化逻辑
+        curr_import['commodity'] = curr_import['commodity'].str.strip().str.split('(|\\(').str[0]
+        prev_import['commodity'] = prev_import['commodity'].str.strip().str.split('(|\\(').str[0]
+        curr_export['commodity'] = curr_export['commodity'].str.strip().str.split('(|\\(').str[0]
+        prev_export['commodity'] = prev_export['commodity'].str.strip().str.split('(|\\(').str[0]
+        total_df['commodity'] = total_df['commodity'].str.strip().str.split('(|\\(').str[0]
+        prev_total_df['commodity'] = prev_total_df['commodity'].str.strip().str.split('(|\\(').str[0]
 
-        curr_export = pd.merge(curr_export, prev_export, on='commodity', how='left')
-        curr_export['export'] = round(curr_export['export_x'] - curr_export['export_y'], 4)
+        # 差值计算优化 - 开始
+        curr_import = pd.merge(curr_import, prev_import, on='commodity', how='left').fillna(0)
+        curr_import['import'] = (curr_import['import_x'] - curr_import['import_y']).round(4)
+
+        curr_export = pd.merge(curr_export, prev_export, on='commodity', how='left').fillna(0)
+        curr_export['export'] = (curr_export['export_x'] - curr_export['export_y']).round(4)
+
+        total_df = pd.merge(total_df, prev_total_df, on='commodity', how='left').fillna(0)
+        total_df['total'] = (total_df['total_x'] - total_df['total_y']).round(4)
+        # 差值计算优化 - 结束
 
-        total_df = pd.merge(total_df, prev_total_df, on='commodity', how='left')
-        total_df['total'] = round(total_df['total_x'] - total_df['total_y'], 4)
         log.info(f"合并文件: {path}*********{previous_month_dir}")
 
-    # 合并进出口数据
-    merged_df = pd.merge(curr_import, curr_export, on='commodity', how='outer')
-    merged_df = pd.merge(merged_df, total_df, on='commodity', how='outer')
+    # 合并进出口数据优化 - 开始
+    merged_df = pd.merge(curr_import, curr_export, on='commodity', how='outer').fillna(0)
+    merged_df = pd.merge(merged_df, total_df, on='commodity', how='outer').fillna(0)
+    # 合并进出口数据优化 - 结束
 
     sql_arr = []
-    # try:
     for _, row in merged_df.iterrows():
         country_name = str(row['commodity']).strip()
         if country_name.endswith(")") or country_name.endswith(")"):
@@ -138,8 +148,6 @@ def process_folder(path):
             f"'{yoy_export}', NOW()) ON DUPLICATE KEY UPDATE create_time = now();"
         )
         sql_arr.append(sql)
-    # except Exception as e:
-    #     log.info(f"{year_month} 处理时发生异常: {str(e)}")
 
     log.info(f"√ {year_month} 成功生成 SQL 条数: {len(sql_arr)}")
     # 批量插入数据库
@@ -168,8 +176,9 @@ def hierarchical_traversal(root_path):
 
 
 if __name__ == '__main__':
-    # hierarchical_traversal(download_dir)
+    hierarchical_traversal(download_dir)
+
+    # root = Path(download_dir)/'2024'/'10'
+    # process_folder(root)
 
-    root = Path(download_dir) / '2024' / '07'
-    process_folder(root)
-    log.info("浙江省海关国别所有文件处理完成!")
+    log.info("浙江省海关国别所有文件处理完成!")