Bladeren bron

crawl path update2

zhangfan 1 dag geleden
bovenliggende
commit
3cc7b4df15
37 gewijzigde bestanden met toevoegingen van 168 en 176 verwijderingen
  1. 15 17
      crossborder/anhui/crawl_gov_anhui_full.py
  2. 4 4
      crossborder/anhui/gov_commodity_anhui_city.py
  3. 4 4
      crossborder/anhui/gov_commodity_anhui_country.py
  4. 4 4
      crossborder/anhui/gov_commodity_anhui_import_export.py
  5. 4 4
      crossborder/auto_incre_main.py
  6. 3 3
      crossborder/fujian/fujian_parse_excel.py
  7. 3 3
      crossborder/fujian/selenium_fujian_download.py
  8. 3 3
      crossborder/guangdong/guangdong_gongbei_parse_excel.py
  9. 5 5
      crossborder/guangdong/guangdong_sub_customs_parse_excel.py
  10. 6 6
      crossborder/guangdong/selenium_guangdong_city.py
  11. 5 5
      crossborder/guangdong/selenium_guangdong_download.py
  12. 16 17
      crossborder/hebei/crawl_gov_hebei_full.py
  13. 4 4
      crossborder/hebei/gov_commodity_hebei_city.py
  14. 4 4
      crossborder/hebei/gov_commodity_hebei_country.py
  15. 4 4
      crossborder/hebei/gov_commodity_hebei_import_export.py
  16. 3 3
      crossborder/henan/henan_parse_excel.py
  17. 3 3
      crossborder/henan/selenium_henan_download.py
  18. 16 19
      crossborder/jiangsu/crawl_gov_jiangsu_full.py
  19. 4 4
      crossborder/jiangsu/gov_commodity_jiangsu_city.py
  20. 4 4
      crossborder/jiangsu/gov_commodity_jiangsu_country.py
  21. 3 3
      crossborder/jiangsu/gov_commodity_jiangsu_import_export.py
  22. 1 1
      crossborder/quanguo/CountryTrade.py
  23. 1 1
      crossborder/quanguo/CountryTradeYear.py
  24. 1 1
      crossborder/quanguo/detail.py
  25. 3 3
      crossborder/quanguo/selenium_download.py
  26. 5 5
      crossborder/shandong/selenium_shandong_download.py
  27. 4 4
      crossborder/shandong/shandong_parse_excel.py
  28. 1 1
      crossborder/utils/base_country_code.py
  29. 1 1
      crossborder/utils/base_mysql.py
  30. 2 2
      crossborder/utils/crawl_gov_commodity.py
  31. 1 1
      crossborder/utils/db_helper.py
  32. 1 1
      crossborder/utils/download_utils.py
  33. 1 1
      crossborder/utils/parse_utils.py
  34. 17 19
      crossborder/zhejiang/crawl_gov_zhejiang_full.py
  35. 4 4
      crossborder/zhejiang/gov_commodity_zhejiang_city.py
  36. 4 4
      crossborder/zhejiang/gov_commodity_zhejiang_country.py
  37. 4 4
      crossborder/zhejiang/gov_commodity_zhejiang_import_export.py

+ 15 - 17
crossborder/anhui/crawl_gov_anhui_full.py

@@ -1,3 +1,4 @@
+import argparse
 import os
 import random
 import re
@@ -12,11 +13,11 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from anhui import gov_commodity_anhui_city, download_dir
-from anhui import gov_commodity_anhui_country
-from anhui import gov_commodity_anhui_import_export
-from utils import base_country_code, base_mysql
-from utils.log import log
+from crossborder.anhui import gov_commodity_anhui_city, download_dir
+from crossborder.anhui import gov_commodity_anhui_country
+from crossborder.anhui import gov_commodity_anhui_import_export
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.log import log
 
 def configure_stealth_options():
     """增强型反检测配置[1,4](@ref)"""
@@ -194,7 +195,7 @@ def crawl_with_selenium(url, mark):
     driver = webdriver.Firefox(options=configure_stealth_options())
 
     year_month = None
-    if 'increment' == mark:
+    if 'auto' == mark:
         res = detect_latest_month(driver, url)
         if res is None:
             log.info("安徽省海关没有最新数据更新")
@@ -319,17 +320,14 @@ def hierarchical_traversal(root_path):
                 gov_commodity_anhui_city.process_folder(md['path'])
 
 def main():
-    crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'all')
-    # crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html', 'increment')
-    # print(f"安徽合肥海关全量数据下载任务完成")
-    # # 等待5s后执行
-    # time.sleep(5)
-    # hierarchical_traversal(base_country_code.download_dir)
-    # print("安徽合肥海关类章、国家、城市所有文件处理完成!")
-    # time.sleep(5)
-    # base_mysql.update_january_yoy('安徽省')
-    # base_mysql.update_shandong_yoy('安徽省')
-    # print("安徽合肥海关城市同比sql处理完成")
+    parser = argparse.ArgumentParser(description="爬取模式: 全量(all) 或 增量(auto)")
+    parser.add_argument("mode", choices=["all", "auto"], help="运行模式")
+    args = parser.parse_args()
+
+    if args.mode == "all":
+        crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','all')
+    else:
+        crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','auto')
 
 if __name__ == '__main__':
     main()

+ 4 - 4
crossborder/anhui/gov_commodity_anhui_city.py

@@ -2,10 +2,10 @@ from pathlib import Path
 
 import pandas as pd
 
-from anhui import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.anhui import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 city_code_map = {
     "安徽省合肥市": "340100",

+ 4 - 4
crossborder/anhui/gov_commodity_anhui_country.py

@@ -2,10 +2,10 @@ from pathlib import Path
 
 import pandas as pd
 
-from anhui import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.anhui import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 # 排除地区名单
 EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",

+ 4 - 4
crossborder/anhui/gov_commodity_anhui_import_export.py

@@ -3,11 +3,11 @@ from pathlib import Path
 
 import pandas as pd
 
-from anhui import download_dir
-from utils import base_country_code, base_mysql
+from crossborder.anhui import download_dir
+from crossborder.utils import base_country_code, base_mysql
 
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 CUSTOM_COMMODITY_REPLACEMENTS = {
     '家具': '家具及其零件',

+ 4 - 4
crossborder/auto_incre_main.py

@@ -1,7 +1,7 @@
-from anhui import crawl_gov_anhui_full
-from hebei import crawl_gov_hebei_full
-from jiangsu import crawl_gov_jiangsu_full
-from zhejiang import crawl_gov_zhejiang_full
+from crossborder.anhui import crawl_gov_anhui_full
+from crossborder.hebei import crawl_gov_hebei_full
+from crossborder.jiangsu import crawl_gov_jiangsu_full
+from crossborder.zhejiang import crawl_gov_zhejiang_full
 from fujian import selenium_fujian_download
 from henan import selenium_henan_download
 from shandong import selenium_shandong_download

+ 3 - 3
crossborder/fujian/fujian_parse_excel.py

@@ -3,9 +3,9 @@ from pathlib import Path
 
 import pandas as pd
 
-from utils.db_helper import DBHelper
-from utils.constants import DOWNLOAD_DIR
-from utils.parse_utils import convert_wan_to_yuan, extract_year_month_from_path, traverse_and_process
+from crossborder.utils.db_helper import DBHelper
+from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.parse_utils import convert_wan_to_yuan, extract_year_month_from_path, traverse_and_process
 
 FUJIAN_CITY = {
 "福州市": "350100",

+ 3 - 3
crossborder/fujian/selenium_fujian_download.py

@@ -10,9 +10,9 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
 from fujian.fujian_parse_excel import parse_excel
-from utils.constants import DOWNLOAD_DIR
-from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
-from utils.parse_utils import traverse_and_process
+from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
+from crossborder.utils.parse_utils import traverse_and_process
 
 # 基础配置
 

+ 3 - 3
crossborder/guangdong/guangdong_gongbei_parse_excel.py

@@ -3,10 +3,10 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from utils.db_helper import DBHelper
+from crossborder.utils.db_helper import DBHelper
 from quanguo.detail import parse_value
-from utils.constants import GUANGDONG_CITY
-from utils.log import log
+from crossborder.utils.constants import GUANGDONG_CITY
+from crossborder.utils.log import log
 
 PROV_CODE = "440000"
 PROV_NAME = "广东省"

+ 5 - 5
crossborder/guangdong/guangdong_sub_customs_parse_excel.py

@@ -3,10 +3,10 @@ from pathlib import Path
 
 import pandas as pd
 
-from utils.db_helper import DBHelper
-from utils.constants import DOWNLOAD_DIR, GUANGDONG_CITY
-from utils.log import log
-from utils.parse_utils import traverse_and_process, extract_year_month_from_path, get_previous_month_dir
+from crossborder.utils.db_helper import DBHelper
+from crossborder.utils.constants import DOWNLOAD_DIR, GUANGDONG_CITY
+from crossborder.utils.log import log
+from crossborder.utils.parse_utils import traverse_and_process, extract_year_month_from_path, get_previous_month_dir
 
 # 配置日志
 PROV_CODE = "440000"
@@ -643,7 +643,7 @@ def parse_excel(current_dir):
         # log.debug(f"处理后数据示例:\n{final_df.head()}")
 
         # 这里调用DBHelper入库(实际使用时请取消注释)
-        from utils.db_helper import DBHelper
+        from crossborder.utils.db_helper import DBHelper
         db = DBHelper()
         db.bulk_insert(
             final_df,

+ 6 - 6
crossborder/guangdong/selenium_guangdong_city.py

@@ -10,15 +10,15 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from utils.db_helper import DBHelper
+from crossborder.utils.db_helper import DBHelper
 from guangdong.guangdong_gongbei_parse_excel import parse_region_table, calculate_monthly_data
 from guangdong.guangdong_sub_customs_parse_excel import parse_excel
-from utils.constants import DOWNLOAD_DIR
-from utils.constants import GUANGDONG_CUSTOMS_URL
-from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel, download_excel2, \
+from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.constants import GUANGDONG_CUSTOMS_URL
+from crossborder.utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel, download_excel2, \
     batch_download_excel
-from utils.log import log
-from utils.parse_utils import traverse_and_process
+from crossborder.utils.log import log
+from crossborder.utils.parse_utils import traverse_and_process
 
 download_dir = DOWNLOAD_DIR / "guangdong"
 

+ 5 - 5
crossborder/guangdong/selenium_guangdong_download.py

@@ -11,11 +11,11 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from utils.db_helper import DBHelper
-from utils.constants import DOWNLOAD_DIR, COUNTRY_CODE_MAPPING
-from utils.download_utils import configure_stealth_options, generate_month_sequence
-from utils.log import log
-from utils.parse_utils import clean_county_name, convert_wan_to_yuan, clean_commodity_name
+from crossborder.utils.db_helper import DBHelper
+from crossborder.utils.constants import DOWNLOAD_DIR, COUNTRY_CODE_MAPPING
+from crossborder.utils.download_utils import configure_stealth_options, generate_month_sequence
+from crossborder.utils.log import log
+from crossborder.utils.parse_utils import clean_county_name, convert_wan_to_yuan, clean_commodity_name
 
 
 BASE_URL = "http://gdfs.customs.gov.cn/guangdong_sub/zwgk62/sjgb59/6b4cdb3f-1.html"

+ 16 - 17
crossborder/hebei/crawl_gov_hebei_full.py

@@ -1,3 +1,4 @@
+import argparse
 import os
 import random
 import re
@@ -12,12 +13,12 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from hebei import download_dir
-from hebei import gov_commodity_hebei_city
-from hebei import gov_commodity_hebei_country
-from hebei import gov_commodity_hebei_import_export
-from utils import base_country_code, base_mysql
-from utils.log import log
+from crossborder.hebei import download_dir
+from crossborder.hebei import gov_commodity_hebei_city
+from crossborder.hebei import gov_commodity_hebei_country
+from crossborder.hebei import gov_commodity_hebei_import_export
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.log import log
 
 
 def get_current_target_titles():
@@ -172,7 +173,7 @@ def crawl_with_selenium(url, mark):
     driver = webdriver.Firefox(options=configure_stealth_options())
 
     year_month = None
-    if 'increment' == mark:
+    if 'auto' == mark:
         res = detect_latest_month(driver, url)
         if res is None:
             log.info("河北省海关没有最新数据更新")
@@ -296,16 +297,14 @@ def hierarchical_traversal(root_path):
 
 
 def main():
-    # crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html', 'all')
-    crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html', 'increment')
-    # 等待5s后执行
-    # time.sleep(5)
-    # hierarchical_traversal(base_country_code.download_dir)
-    # log.info(f"河北石家庄海关全量数据下载任务完成")
-    # time.sleep(5)
-    # base_mysql.update_january_yoy('河北省')
-    # base_mysql.update_shandong_yoy('河北省')
-    # log.info("河北石家庄海关城市同比sql处理完成")
+    parser = argparse.ArgumentParser(description="爬取模式: 全量(all) 或 增量(auto)")
+    parser.add_argument("mode", choices=["all", "auto"], help="运行模式")
+    args = parser.parse_args()
+
+    if args.mode == "all":
+        crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html', 'all')
+    else:
+        crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html','auto')
 
 if __name__ == '__main__':
     main()

+ 4 - 4
crossborder/hebei/gov_commodity_hebei_city.py

@@ -3,10 +3,10 @@ from pathlib import Path
 import pandas
 import pandas as pd
 
-from hebei import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.hebei import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 city_code_map = {
     "石家庄市": "130100",

+ 4 - 4
crossborder/hebei/gov_commodity_hebei_country.py

@@ -3,10 +3,10 @@ from pathlib import Path
 import pandas
 import pandas as pd
 
-from hebei import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.hebei import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",
                    "东南亚国家联盟", "欧洲联盟", "亚太经济合作组织",

+ 4 - 4
crossborder/hebei/gov_commodity_hebei_import_export.py

@@ -2,11 +2,11 @@ from pathlib import Path
 
 import pandas as pd
 import re
-from utils.log import log
+from crossborder.utils.log import log
 
-from hebei import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
+from crossborder.hebei import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
 
 CUSTOM_COMMODITY_REPLACEMENTS = {
     '稻谷及大米': '稻谷、大米及大米粉',

+ 3 - 3
crossborder/henan/henan_parse_excel.py

@@ -3,9 +3,9 @@ from pathlib import Path
 
 import pandas as pd
 
-from utils.db_helper import DBHelper
-from utils.constants import COUNTRY_CODE_MAPPING, EXCLUDE_REGIONS, DOWNLOAD_DIR
-from utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, find_unmatched_countries, \
+from crossborder.utils.db_helper import DBHelper
+from crossborder.utils.constants import COUNTRY_CODE_MAPPING, EXCLUDE_REGIONS, DOWNLOAD_DIR
+from crossborder.utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, find_unmatched_countries, \
     extract_year_month_from_path, traverse_and_process
 
 # 常量配置(新增路径正则校验)

+ 3 - 3
crossborder/henan/selenium_henan_download.py

@@ -13,9 +13,9 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
 from henan.henan_parse_excel import parse_excel
-from utils.constants import DOWNLOAD_DIR
-from utils.download_utils import configure_stealth_options, get_previous_month, download_excel, generate_month_sequence
-from utils.parse_utils import traverse_and_process
+from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.download_utils import configure_stealth_options, get_previous_month, download_excel, generate_month_sequence
+from crossborder.utils.parse_utils import traverse_and_process
 
 # 基础配置
 

+ 16 - 19
crossborder/jiangsu/crawl_gov_jiangsu_full.py

@@ -1,3 +1,4 @@
+import argparse
 import os
 import random
 import re
@@ -16,13 +17,13 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from jiangsu import download_dir
-from jiangsu import gov_commodity_jiangsu_country
-from jiangsu import gov_commodity_jiangsu_city
-from jiangsu import gov_commodity_jiangsu_import_export
+from crossborder.jiangsu import download_dir
+from crossborder.jiangsu import gov_commodity_jiangsu_country
+from crossborder.jiangsu import gov_commodity_jiangsu_city
+from crossborder.jiangsu import gov_commodity_jiangsu_import_export
 
-from utils import base_country_code, base_mysql
-from utils.log import log
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.log import log
 
 # rarfile.UNRAR_EXECUTABLE = r"C:\Program Files\WinRAR\UnRAR.exe"
 rarfile.UNRAR_EXECUTABLE = "unrar"
@@ -213,7 +214,7 @@ def crawl_with_selenium(url, mark):
     driver = webdriver.Firefox(options=configure_stealth_options())
 
     year_month = None
-    if 'increment' == mark:
+    if 'auto' == mark:
         res = detect_latest_month(driver, url)
         if res is None:
             log.info("江苏省海关没有最新数据更新")
@@ -333,18 +334,14 @@ def hierarchical_traversal(root_path, all_records):
                 gov_commodity_jiangsu_city.process_folder(md['path'])
 
 def main():
-    crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html', 'all')
-    # crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html', 'increment')
-    # log.info(f"江苏南京海关全量数据下载任务完成")
-    # # 等待5s后执行
-    # time.sleep(5)
-    # all_records = base_mysql.get_hs_all()
-    # hierarchical_traversal(base_country_code.download_dir, all_records)
-    # log.info("江苏南京海关类章、国家、城市所有文件处理完成!")
-    # time.sleep(5)
-    # base_mysql.update_january_yoy('江苏省')
-    # base_mysql.update_shandong_yoy('江苏省')
-    # log.info("江苏南京海关城市同比sql处理完成")
+    parser = argparse.ArgumentParser(description="爬取模式: 全量(all) 或 增量(auto)")
+    parser.add_argument("mode", choices=["all", "auto"], help="运行模式")
+    args = parser.parse_args()
+
+    if args.mode == "all":
+        crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','all')
+    else:
+        crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','auto')
 
 if __name__ == '__main__':
     main()

+ 4 - 4
crossborder/jiangsu/gov_commodity_jiangsu_city.py

@@ -3,10 +3,10 @@ from pathlib import Path
 
 import pandas as pd
 
-from jiangsu import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.jiangsu import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 city_code_map = {
     "南京市": "320100",

+ 4 - 4
crossborder/jiangsu/gov_commodity_jiangsu_country.py

@@ -2,10 +2,10 @@ from pathlib import Path
 
 import pandas as pd
 
-from jiangsu import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.jiangsu import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 # 排除地区名单
 EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",

+ 3 - 3
crossborder/jiangsu/gov_commodity_jiangsu_import_export.py

@@ -3,9 +3,9 @@ from pathlib import Path
 
 import pandas as pd
 
-from jiangsu import download_dir
-from utils import base_country_code, base_mysql
-from utils.log import log
+from crossborder.jiangsu import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.log import log
 
 YEAR_PATTERN = re.compile(r"^\d{4}$")
 MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")

+ 1 - 1
crossborder/quanguo/CountryTrade.py

@@ -2,7 +2,7 @@ import re
 
 import pandas as pd
 
-from utils.parse_utils import clean_county_name
+from crossborder.utils.parse_utils import clean_county_name
 
 # 配置参数
 EXCEL_PATH = r"D:/Downloads/2025051809262394128.xls"

+ 1 - 1
crossborder/quanguo/CountryTradeYear.py

@@ -5,7 +5,7 @@ import pandas as pd
 import pymysql
 from pymysql import Error
 
-from utils.constants import COUNTRY_CODE_MAPPING
+from crossborder.utils.constants import COUNTRY_CODE_MAPPING
 
 YEAR = 2023
 

+ 1 - 1
crossborder/quanguo/detail.py

@@ -1,6 +1,6 @@
 # ---------------------------- 核心解析逻辑修改 ----------------------------
 from quanguo.CountryTradeYear import COUNTRY_CODE_MAPPING
-from utils.parse_utils import clean_county_name
+from crossborder.utils.parse_utils import clean_county_name
 
 
 def chinese_class_to_number(class_str):

+ 3 - 3
crossborder/quanguo/selenium_download.py

@@ -11,11 +11,11 @@ from selenium.webdriver import FirefoxOptions, ActionChains
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from utils.constants import DOWNLOAD_DIR
-from utils.download_utils import configure_stealth_options, wait_for_download, download_excel
+from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.download_utils import configure_stealth_options, wait_for_download, download_excel
 from selenium.webdriver.common.by import By
 
-from utils.log import log
+from crossborder.utils.log import log
 
 YEAR = 2025
 TARGET_TABLES = [

+ 5 - 5
crossborder/shandong/selenium_shandong_download.py

@@ -9,12 +9,12 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from utils.db_helper import DBHelper
+from crossborder.utils.db_helper import DBHelper
 from shandong.shandong_parse_excel import parse_excel
-from utils.constants import DOWNLOAD_DIR
-from utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
-from utils.log import log
-from utils.parse_utils import traverse_and_process
+from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.download_utils import configure_stealth_options, generate_month_sequence, download_excel
+from crossborder.utils.log import log
+from crossborder.utils.parse_utils import traverse_and_process
 
 # 基础配置
 

+ 4 - 4
crossborder/shandong/shandong_parse_excel.py

@@ -4,11 +4,11 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 
-from utils.db_helper import DBHelper
+from crossborder.utils.db_helper import DBHelper
 from quanguo.CountryTrade import COUNTRY_CODE_MAPPING
-from utils.constants import DOWNLOAD_DIR
-from utils.log import log
-from utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, \
+from crossborder.utils.constants import DOWNLOAD_DIR
+from crossborder.utils.log import log
+from crossborder.utils.parse_utils import clean_county_name, clean_commodity_name, convert_wan_to_yuan, \
     extract_year_month_from_path, get_previous_month_dir, find_unmatched_countries, traverse_and_process
 
 # 常量配置

+ 1 - 1
crossborder/utils/base_country_code.py

@@ -4,7 +4,7 @@ from pathlib import Path
 
 import pandas as pd
 
-from utils.log import log
+from crossborder.utils.log import log
 
 YEAR_PATTERN = re.compile(r"^\d{4}$")
 MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")

+ 1 - 1
crossborder/utils/base_mysql.py

@@ -2,7 +2,7 @@ import pymysql
 from sqlalchemy import create_engine, text
 from urllib.parse import quote_plus
 
-from utils.log import log
+from crossborder.utils.log import log
 
 provinces = [
     "北京市", "天津市", "上海市", "重庆市",

+ 2 - 2
crossborder/utils/crawl_gov_commodity.py

@@ -1,7 +1,7 @@
 import pandas as pd
 
-from utils import base_mysql
-from utils.log import log
+from crossborder.utils import base_mysql
+from crossborder.utils.log import log
 
 
 def generate_sql_from_excel(excel_file):

+ 1 - 1
crossborder/utils/db_helper.py

@@ -5,7 +5,7 @@ import logging
 import pymysql
 import pandas as pd
 
-from utils.log import log
+from crossborder.utils.log import log
 
 DB_CONFIG = {
     'host': '10.130.75.149',

+ 1 - 1
crossborder/utils/download_utils.py

@@ -10,7 +10,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from utils.log import log
+from crossborder.utils.log import log
 
 DOWNLOAD_TIMEOUT = 60
 

+ 1 - 1
crossborder/utils/parse_utils.py

@@ -2,7 +2,7 @@ import re
 from decimal import Decimal
 from pathlib import Path
 
-from utils.log import log
+from crossborder.utils.log import log
 
 YEAR_PATTERN = re.compile(r"^\d{4}$")
 MONTH_PATTERN = re.compile(r"^(0[1-9]|1[0-2])$")

+ 17 - 19
crossborder/zhejiang/crawl_gov_zhejiang_full.py

@@ -1,3 +1,4 @@
+import argparse
 import os
 import random
 import re
@@ -15,12 +16,12 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
-from zhejiang import download_dir
-from zhejiang import gov_commodity_zhejiang_city
-from zhejiang import gov_commodity_zhejiang_country
-from zhejiang import gov_commodity_zhejiang_import_export
-from utils import base_country_code, base_mysql
-from utils.log import log
+from crossborder.zhejiang import download_dir
+from crossborder.zhejiang import gov_commodity_zhejiang_city
+from crossborder.zhejiang import gov_commodity_zhejiang_country
+from crossborder.zhejiang import gov_commodity_zhejiang_import_export
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.log import log
 
 def configure_stealth_options():
     """增强型反检测配置[1,4](@ref)"""
@@ -120,7 +121,7 @@ def process_month_tabs(driver, year, base_url, year_month):
                         retry_count += 1
                         break
                     if tar_month != month_text:
-                        log.info(f"{year}年 {month_text} 月份跳过, increment tar: {year_month}")
+                        log.info(f"{year}年 {month_text} 月份跳过, auto tar: {year_month}")
                         continue
                 a_tag.click()
 
@@ -283,7 +284,7 @@ def crawl_with_selenium(url, mark):
     driver = webdriver.Firefox(options=configure_stealth_options())
 
     year_month = None
-    if 'increment' == mark:
+    if 'auto' == mark:
         res = detect_latest_month(driver, url)
         if res is None:
             log.info("浙江省海关没有最新数据更新")
@@ -380,17 +381,14 @@ def hierarchical_traversal(root_path):
                 gov_commodity_zhejiang_city.process_folder(md['path'])
 
 def main():
-    # crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
-    crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'increment')
-    # log.info(f"浙江杭州海关全量数据下载任务完成")
-    # # 等待5s后执行
-    # time.sleep(5)
-    # hierarchical_traversal(download_dir)
-    # log.info("浙江杭州海关类章、国家、城市所有文件处理完成!")
-    # time.sleep(5)
-    # base_mysql.update_january_yoy('浙江省')
-    # base_mysql.update_shandong_yoy('浙江省')
-    # log.info("浙江杭州海关城市同比sql处理完成")
+    parser = argparse.ArgumentParser(description="爬取模式: 全量(all) 或 增量(auto)")
+    parser.add_argument("mode", choices=["all", "auto"], help="运行模式")
+    args = parser.parse_args()
+
+    if args.mode == "all":
+        crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
+    else:
+        crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html','auto')
 
 if __name__ == '__main__':
     main()

+ 4 - 4
crossborder/zhejiang/gov_commodity_zhejiang_city.py

@@ -3,10 +3,10 @@ from pathlib import Path
 
 import pandas as pd
 
-from zhejiang import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.zhejiang import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 city_code_map = {
     "杭州地区": "330100",

+ 4 - 4
crossborder/zhejiang/gov_commodity_zhejiang_country.py

@@ -2,10 +2,10 @@ from pathlib import Path
 
 import pandas as pd
 
-from zhejiang import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.zhejiang import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 # 排除地区名单
 EXCLUDE_REGIONS = ["亚洲", "非洲", "欧洲", "拉丁美洲", "北美洲", "大洋洲", "南极洲",

+ 4 - 4
crossborder/zhejiang/gov_commodity_zhejiang_import_export.py

@@ -3,10 +3,10 @@ from pathlib import Path
 import re
 import pandas as pd
 
-from zhejiang import download_dir
-from utils import base_country_code, base_mysql
-from utils.base_country_code import format_sql_value
-from utils.log import log
+from crossborder.zhejiang import download_dir
+from crossborder.utils import base_country_code, base_mysql
+from crossborder.utils.base_country_code import format_sql_value
+from crossborder.utils.log import log
 
 CUSTOM_COMMODITY_REPLACEMENTS = {
     '稻谷及大米': '稻谷、大米及大米粉',