|
@@ -17,6 +17,8 @@ from crossborder.anhui import gov_commodity_anhui_city, download_dir
|
|
|
from crossborder.anhui import gov_commodity_anhui_country
|
|
|
from crossborder.anhui import gov_commodity_anhui_import_export
|
|
|
from crossborder.utils import base_country_code, base_mysql
|
|
|
+from crossborder.utils.base_country_code import get_last_month
|
|
|
+from crossborder.utils.base_mysql import get_commodity_trade_by_prov_year_month
|
|
|
from crossborder.utils.dingtalk import send_dingtalk_message
|
|
|
from crossborder.utils.log import get_logger
|
|
|
|
|
@@ -236,7 +238,7 @@ def crawl_with_selenium(url, mark):
|
|
|
# 获取下一页的URL
|
|
|
next_page_url = next_page_btn.get_attribute("onclick")
|
|
|
if not next_page_url:
|
|
|
- log.info("已到达最后一页,停止爬取")
|
|
|
+ log.info("已到达最后一页,停止采集")
|
|
|
break
|
|
|
# 从onclick属性中提取URL
|
|
|
next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
|
|
@@ -247,7 +249,7 @@ def crawl_with_selenium(url, mark):
|
|
|
# 访问下一页
|
|
|
driver.get(next_page_url)
|
|
|
|
|
|
- log.info(f"开始爬取 {next_page_url} 页面数据")
|
|
|
+ log.info(f"开始采集 {next_page_url} 页面数据")
|
|
|
|
|
|
finally:
|
|
|
driver.quit()
|
|
@@ -322,22 +324,34 @@ def hierarchical_traversal(root_path):
|
|
|
gov_commodity_anhui_city.process_folder(md['path'])
|
|
|
|
|
|
def main():
    """Run the Anhui customs collection job and report the outcome via DingTalk.

    Command line:
        --year  Optional int. When it equals 2023, ``crawl_with_selenium`` is
                called in 'all' mode (full collection); for any other value,
                or when omitted, it is called in 'auto' mode (incremental).

    After the crawl, ``get_commodity_trade_by_prov_year_month('安徽省',
    get_last_month())`` is called and a DingTalk message reports either the
    number of returned records or that nothing was returned.

    Any ``Exception`` raised during the crawl or the follow-up query is logged
    with its traceback and reported to DingTalk; it is not re-raised.
    """
    # Single source of truth for the listing page both modes start from.
    index_url = 'http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html'

    # Argument errors exit via SystemExit, which the handler below never
    # caught anyway, so parsing lives outside the try block.
    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
    parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
    args = parser.parse_args()

    try:
        start_time = time.time()
        if args.year == 2023:
            log.info("正在全量采集安徽省海关数据")
            crawl_with_selenium(index_url, 'all')
            elapsed = _format_elapsed(time.time() - start_time)
            send_dingtalk_message(f'【安徽省海关】全量数据采集完成,耗时 {elapsed}')
        else:
            log.info("正在增量采集安徽省海关数据")
            status = crawl_with_selenium(index_url, 'auto')
            # Only a 'finish' result is announced; other results stay silent.
            if status == 'finish':
                elapsed = _format_elapsed(time.time() - start_time)
                send_dingtalk_message(f'【安徽省海关】增量数据采集完成,耗时 {elapsed}')

        # NOTE(review): assumed to run after both modes (try-level), since the
        # source's indentation was lost — confirm against the real file.
        records = get_commodity_trade_by_prov_year_month('安徽省', get_last_month())
        if records is not None:
            send_dingtalk_message(f"【安徽省海关】 commodity_trade 查询到 {len(records)} 条记录,文件已生成")
        else:
            send_dingtalk_message("【安徽省海关】 未查询到任何记录或发生错误")
    except Exception as e:
        # Keep the traceback in the log; the DingTalk message only carries str(e).
        log.exception("安徽省海关数据采集失败")
        send_dingtalk_message(f'【安徽省海关】发生错误:{e}')


def _format_elapsed(duration):
    """Format a duration given in seconds as '<minutes>分<seconds>秒'."""
    minutes, seconds = divmod(duration, 60)
    return f'{int(minutes)}分{seconds:.1f}秒'
|
|
|
|
|
|
# Entry-point guard: run the collection job only when executed as a script.
if __name__ == "__main__":
    main()
|