|
@@ -14,6 +14,7 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
|
|
from crossborder.henan.henan_parse_excel import parse_excel
|
|
|
from crossborder.utils.constants import DOWNLOAD_DIR
|
|
|
+from crossborder.utils.db_helper import DBHelper
|
|
|
from crossborder.utils.dingtalk import send_dingtalk_message
|
|
|
from crossborder.utils.download_utils import configure_stealth_options, get_previous_month, download_excel, generate_month_sequence
|
|
|
from crossborder.utils.log import get_logger
|
|
@@ -59,7 +60,7 @@ def detect_latest_month(driver):
|
|
|
log.info(f"已找到最新月份数据 {check_year}-{check_month}")
|
|
|
return check_year, check_month
|
|
|
|
|
|
- log.info(f"未找到匹配项(正则:{pattern.pattern})")
|
|
|
+ log.error(f"未找到匹配项(正则:{pattern.pattern})")
|
|
|
except TimeoutException:
|
|
|
log.error(f"页面加载超时或无匹配项({check_year}-{check_month})")
|
|
|
continue
|
|
@@ -199,22 +200,38 @@ def handle_retry(driver):
|
|
|
|
|
|
|
|
|
def main():
|
|
|
- """主入口(优化参数处理逻辑)"""
|
|
|
- global target_months
|
|
|
+ """主入口(优化河南海关数据采集逻辑)"""
|
|
|
parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
|
|
|
parser.add_argument('--year', type=int, default=None,
|
|
|
help='终止年份(如2023),未指定时抓取最新两个月')
|
|
|
args = parser.parse_args()
|
|
|
start_time = time.time()
|
|
|
- driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
|
|
|
+ target_months = [] # 初始化目标月份列表
|
|
|
+ data_collected = False # 数据采集状态标记
|
|
|
+ log.info("【河南海关】数据抓取开始".center(66, "*"))
|
|
|
+
|
|
|
+ # 仅初始化浏览器一次,避免重复创建
|
|
|
+ driver = None
|
|
|
+
|
|
|
try:
|
|
|
- # 智能检测最新有效月份
|
|
|
+ # 1. 初始化浏览器
|
|
|
+ driver = webdriver.Firefox(options=configure_stealth_options(download_dir))
|
|
|
+ log.info("浏览器初始化完成")
|
|
|
+
|
|
|
+ # 2. 检测最新有效月份
|
|
|
valid_year, valid_month = detect_latest_month(driver)
|
|
|
- log.info(f"检测到最新有效数据:{valid_year}年{valid_month:02d}月")
|
|
|
+ log.info(f"【河南海关】最新数据:{valid_year}年{valid_month:02d}月")
|
|
|
|
|
|
- # 生成目标序列
|
|
|
+ # 3. 数据存在性检查(仅在未指定年份时执行)
|
|
|
+ if not args.year:
|
|
|
+ db = DBHelper()
|
|
|
+ count = db.get_code_exist(f'{valid_year}-{valid_month:02d}', "410000")
|
|
|
+ if count > 0:
|
|
|
+ log.error(f"数据库已存在【河南省】 {valid_year}-{valid_month:02d} 商品贸易数据,本次抓取终止")
|
|
|
+ return
|
|
|
+
|
|
|
+ # 4. 生成目标月份序列
|
|
|
if args.year:
|
|
|
- # 指定年份时:从最新月到目标年1月
|
|
|
target_months = generate_month_sequence(
|
|
|
start_year=valid_year,
|
|
|
start_month=valid_month,
|
|
@@ -222,22 +239,44 @@ def main():
|
|
|
skip_january=True
|
|
|
)
|
|
|
else:
|
|
|
- # 未指定年份时:取最近两个月
|
|
|
- target_months = generate_month_sequence(valid_year, valid_month)
|
|
|
+ # 未指定年份时只抓最近两个月份
|
|
|
+ target_months = generate_month_sequence(
|
|
|
+ start_year=valid_year,
|
|
|
+ start_month=valid_month
|
|
|
+ )
|
|
|
+
|
|
|
+ log.info(f"【河南海关】目标采集月份序列:{len(target_months)}个月份")
|
|
|
|
|
|
- log.info(f"目标采集月份序列:{target_months}")
|
|
|
+ # 5. 执行数据采集
|
|
|
reverse_crawler(driver, target_months)
|
|
|
- log.info(f"{len(target_months)}个月份数据已采集完毕")
|
|
|
+ data_collected = True
|
|
|
+ log.info(f"【河南海关】成功采集 {len(target_months)} 个月份数据")
|
|
|
|
|
|
+ # 6. 数据清洗入库
|
|
|
+ log.info("\n【河南海关】数据清洗入库中...")
|
|
|
+ traverse_and_process(download_dir, parse_excel, province_name="henan", year=args.year)
|
|
|
+ log.info("数据清洗入库完成")
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ # 捕获并记录所有异常
|
|
|
+ log.exception(f"【河南海关】采集过程中发生错误: {str(e)}")
|
|
|
+ send_dingtalk_message(f"【河南海关数据采集异常】{str(e)}")
|
|
|
|
|
|
finally:
|
|
|
- driver.quit()
|
|
|
- log.info("\n数据清洗入库中...")
|
|
|
- traverse_and_process(download_dir, parse_excel, province_name="henan", year=args.year)
|
|
|
- duration = time.time() - start_time
|
|
|
- minutes, seconds = divmod(duration, 60) # 转换为分钟和秒
|
|
|
- message = f'【河南海关】{len(target_months)}个月份数据已采集完毕,总耗时:{int(minutes)}分{seconds:.1f}秒'
|
|
|
- send_dingtalk_message(message)
|
|
|
+ # 确保浏览器退出
|
|
|
+ if driver:
|
|
|
+ driver.quit()
|
|
|
+ log.info("浏览器已退出")
|
|
|
+
|
|
|
+ # 7. 只有在成功采集数据时才发送通知
|
|
|
+ if data_collected:
|
|
|
+ duration = time.time() - start_time
|
|
|
+ minutes, seconds = divmod(duration, 60)
|
|
|
+ message = (f"【河南海关】{len(target_months)}个月份数据采集完成"
|
|
|
+ f",总耗时:{int(minutes)}分{seconds:.1f}秒")
|
|
|
+ send_dingtalk_message(message)
|
|
|
+
|
|
|
+ log.info("【河南海关】处理流程结束".center(66, "*"))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
main()
|