|
@@ -256,7 +256,7 @@ def crawl_with_selenium(url, mark):
|
|
# 获取下一页的URL
|
|
# 获取下一页的URL
|
|
next_page_url = next_page_btn.get_attribute("onclick")
|
|
next_page_url = next_page_btn.get_attribute("onclick")
|
|
if not next_page_url:
|
|
if not next_page_url:
|
|
- log.info("已到达最后一页,停止爬取")
|
|
|
|
|
|
+ log.info("已到达最后一页,停止采集")
|
|
break
|
|
break
|
|
# 从onclick属性中提取URL
|
|
# 从onclick属性中提取URL
|
|
next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
|
|
next_page_url = re.search(r"'(.*?)'", next_page_url).group(1)
|
|
@@ -267,7 +267,7 @@ def crawl_with_selenium(url, mark):
|
|
# 访问下一页
|
|
# 访问下一页
|
|
driver.get(next_page_url)
|
|
driver.get(next_page_url)
|
|
|
|
|
|
- log.info(f"开始爬取 {next_page_url} 页面数据")
|
|
|
|
|
|
+ log.info(f"开始采集 {next_page_url} 页面数据")
|
|
|
|
|
|
finally:
|
|
finally:
|
|
driver.quit()
|
|
driver.quit()
|
|
@@ -343,16 +343,16 @@ def main():
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
if args.year == 2023:
|
|
if args.year == 2023:
|
|
- log.info("正在全量爬取江苏省海关数据")
|
|
|
|
|
|
+ log.info("正在全量采集江苏省海关数据")
|
|
crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','all')
|
|
crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','all')
|
|
duration = time.time() - start_time
|
|
duration = time.time() - start_time
|
|
- send_dingtalk_message(f'江苏省海关全量数据爬取完成,耗时 {duration:.2f} 秒')
|
|
|
|
|
|
+ send_dingtalk_message(f'【江苏省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
|
|
else:
|
|
else:
|
|
- log.info("正在增量爬取江苏省海关数据")
|
|
|
|
|
|
+ log.info("正在增量采集江苏省海关数据")
|
|
res = crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','auto')
|
|
res = crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','auto')
|
|
if res == 'finish':
|
|
if res == 'finish':
|
|
duration = time.time() - start_time
|
|
duration = time.time() - start_time
|
|
- send_dingtalk_message(f'江苏省海关增量数据爬取完成,耗时 {duration:.2f} 秒')
|
|
|
|
|
|
+ send_dingtalk_message(f'【江苏省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
main()
|
|
main()
|