|
@@ -1,6 +1,7 @@
|
|
import argparse
|
|
import argparse
|
|
import random
|
|
import random
|
|
import re
|
|
import re
|
|
|
|
+import sys
|
|
import time
|
|
import time
|
|
from datetime import datetime
|
|
from datetime import datetime
|
|
|
|
|
|
@@ -13,6 +14,7 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
|
|
from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import
|
|
from crossborder.quanguo.data_cleaning_to_db import perform_data_cleanup_and_import
|
|
from crossborder.utils.constants import DOWNLOAD_DIR
|
|
from crossborder.utils.constants import DOWNLOAD_DIR
|
|
|
|
+from crossborder.utils.db_helper import DBHelper
|
|
from crossborder.utils.dingtalk import send_dingtalk_message
|
|
from crossborder.utils.dingtalk import send_dingtalk_message
|
|
from crossborder.utils.download_utils import configure_stealth_options, download_excel
|
|
from crossborder.utils.download_utils import configure_stealth_options, download_excel
|
|
from crossborder.utils.log import get_logger
|
|
from crossborder.utils.log import get_logger
|
|
@@ -116,7 +118,7 @@ def go_to_year_page(driver, year):
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
-def crawl_with_selenium(driver, year, latest_only=False):
|
|
|
|
|
|
+def crawl_with_selenium(driver, year, latest_only=False,data_collected=False):
|
|
"""主抓取函数"""
|
|
"""主抓取函数"""
|
|
if year < datetime.now().year:
|
|
if year < datetime.now().year:
|
|
if not go_to_year_page(driver, year):
|
|
if not go_to_year_page(driver, year):
|
|
@@ -140,8 +142,21 @@ def crawl_with_selenium(driver, year, latest_only=False):
|
|
|
|
|
|
table_title = generate_table_title(year)
|
|
table_title = generate_table_title(year)
|
|
|
|
|
|
|
|
+ if latest_only:
|
|
|
|
+ if month_links:
|
|
|
|
+ # 只取第一个月份(最新月份)
|
|
|
|
+ month_links = [month_links[0]]
|
|
|
|
+ new_month = month_links[0][0]
|
|
|
|
+ log.info(f"【{table_name}】处理最新月份:{new_month}月")
|
|
|
|
+ db = DBHelper()
|
|
|
|
+ count = db.get_total_info_exist(f'{year}-{new_month:02d}')
|
|
|
|
+ if count > 0:
|
|
|
|
+ log.error(f"数据库已存在【海关总署】 {year}-{new_month:02d} 收发件人数据,本次抓取终止")
|
|
|
|
+ break
|
|
|
|
+
|
|
|
|
+ data_collected = True
|
|
if result and result[0] in table_title:
|
|
if result and result[0] in table_title:
|
|
- handle_month_data(driver, sanitize_filename(table_name), month_links, year=year, latest_only=latest_only)
|
|
|
|
|
|
+ handle_month_data(driver, sanitize_filename(table_name), month_links, year=year)
|
|
|
|
|
|
driver.execute_script("arguments[0].remove()", row)
|
|
driver.execute_script("arguments[0].remove()", row)
|
|
WebDriverWait(driver, 10).until(EC.staleness_of(row))
|
|
WebDriverWait(driver, 10).until(EC.staleness_of(row))
|
|
@@ -159,9 +174,10 @@ def sanitize_filename(filename):
|
|
return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '-', filename)
|
|
return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '-', filename)
|
|
|
|
|
|
|
|
|
|
-def handle_month_data(driver, table_name, month_links, year, latest_only):
|
|
|
|
|
|
+def handle_month_data(driver, table_name, month_links, year):
|
|
global downloaded_tables
|
|
global downloaded_tables
|
|
main_window = driver.current_window_handle
|
|
main_window = driver.current_window_handle
|
|
|
|
+
|
|
for idx, month_data in enumerate(month_links):
|
|
for idx, month_data in enumerate(month_links):
|
|
if 1 <= month_data[0] <= 12:
|
|
if 1 <= month_data[0] <= 12:
|
|
# 年度表月度表只下载一次(最新月份数据)
|
|
# 年度表月度表只下载一次(最新月份数据)
|
|
@@ -181,39 +197,105 @@ def handle_month_data(driver, table_name, month_links, year, latest_only):
|
|
except Exception as e:
|
|
except Exception as e:
|
|
log.info(f"【异常】下载失败: {str(e)}")
|
|
log.info(f"【异常】下载失败: {str(e)}")
|
|
time.sleep(random.uniform(0.5, 1.5)) # 下载间隔
|
|
time.sleep(random.uniform(0.5, 1.5)) # 下载间隔
|
|
|
|
def main():
    """Crawl the yearly customs data, import it, and report the result.

    Command line:
        --year  Optional start year. When given, every year from that year
                up to the current year is crawled in full. When omitted,
                only the current year is crawled, in "latest month only"
                mode.

    Flow:
        1. Validate the year range (exits with status 1 when the start year
           lies in the future).
        2. Start Firefox, open the base page and crawl the years from newest
           to oldest. A failure in one year is logged, reported to DingTalk
           and does not stop the remaining years.
        3. Always quit the browser if it was started.
        4. When data was collected, run the cleanup/import step and send a
           summary message to DingTalk.

    Side effects:
        Rebinds the module-level names ``current_year`` and ``start_year``.
    """
    global current_year, start_year

    parser = argparse.ArgumentParser(description="抓取海关总署年度数据")
    parser.add_argument("--year", type=int, help="起始年份,例如:--year 2023")
    args = parser.parse_args()

    start_time = time.time()
    years_processed = []    # years whose crawl finished without raising
    data_collected = False  # becomes True once a crawl reports new data
    driver = None

    # 1. Determine the year range. This is done before the try block so that
    #    sys.exit() does not travel through the browser cleanup path.
    current_year = datetime.now().year
    start_year = args.year if args.year else current_year

    if start_year > current_year:
        log.error(f"起始年份 {start_year} 不能大于当前年份 {current_year}")
        sys.exit(1)

    # Newest year first.
    years_to_crawl = list(range(current_year, start_year - 1, -1))

    try:
        log.info(f"【海关总署】开始抓取 {start_year}-{current_year} 年度数据".center(66, "*"))

        # 2. Start the browser.
        # NOTE(review): `download_dir` and `base_url` are not defined in this
        # function; they are expected to exist at module level — confirm.
        options = configure_stealth_options(download_dir)
        driver = webdriver.Firefox(options=options)
        log.info("浏览器初始化完成")

        # 3. Open the landing page.
        driver.get(base_url)

        # 4. Crawl year by year.
        for year in years_to_crawl:
            try:
                log.info(f"\n【{year}年】开始处理".center(66, "-"))

                # "Latest month only" applies solely to the current year of
                # a run that was started without --year.
                is_latest_only = (not args.year) and (year == current_year)

                collected = crawl_with_selenium(
                    driver,
                    year=year,
                    latest_only=is_latest_only,
                    data_collected=data_collected,
                )
                # A bool argument is passed by value: rebinding it inside the
                # callee never reaches this frame, so the flag can only come
                # back through the return value.
                # NOTE(review): confirm crawl_with_selenium returns the flag.
                # While it returns None, data_collected stays False and the
                # import and notification steps below are skipped.
                if collected:
                    data_collected = True

                years_processed.append(year)
                log.info(f"【{year}年】处理完成".center(66, "-"))

            except Exception as e:
                # One bad year must not abort the remaining years.
                log.exception(f"⚠️ {year}年数据采集异常: {str(e)}")
                send_dingtalk_message(f"【海关总署{year}年采集异常】{str(e)}")

    except Exception as e:
        log.exception(f"‼️ 海关总署采集全局错误: {str(e)}")
        send_dingtalk_message(f"【海关总署全局异常】{str(e)}")

    finally:
        # 5. Quit the browser, but only if it was actually started. `driver`
        #    is pre-initialised to None, so a "'driver' in locals()" test is
        #    always true and would call quit() on None.
        if driver is not None:
            driver.quit()
            log.info("浏览器已退出")

    # 6. Clean and import the data (only when something was collected).
    if data_collected:
        log.info("\n【海关总署】数据清洗入库开始".center(66, "*"))
        try:
            log.info("数据清洗入库中...")
            perform_data_cleanup_and_import(years_processed)
            log.info("数据清洗入库完毕")
        except Exception as e:
            log.exception(f"数据清洗入库异常: {str(e)}")
            send_dingtalk_message(f"【海关总署数据清洗异常】{str(e)}")

    # 7. Build the report and send the notification.
    duration = time.time() - start_time
    minutes, seconds = divmod(duration, 60)
    year_range = f"{start_year}年-{current_year}年"

    if data_collected and years_processed:
        processed_years = "、".join(map(str, years_processed))
        message = (
            f"【海关总署数据采集完成】\n"
            f"• 目标年份: {year_range}\n"
            f"• 实际处理年份: {processed_years}\n"
            f"• 处理年份数量: {len(years_processed)}个\n"
            f"• 总耗时: {int(minutes)}分{seconds:.1f}秒"
        )
        send_dingtalk_message(message)

    log.info("【海关总署】数据采集任务结束".center(66, "*"))


if __name__ == "__main__":
    main()
|