Bladeren bron

crawl add try

zhangfan 1 maand geleden
bovenliggende
commit
1ecbc866d3

+ 18 - 15
crossborder/anhui/crawl_gov_anhui_full.py

@@ -322,22 +322,25 @@ def hierarchical_traversal(root_path):
                 gov_commodity_anhui_city.process_folder(md['path'])
 
 def main():
-    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
-    parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
-    args = parser.parse_args()
-
-    start_time = time.time()
-    if args.year == 2023:
-        log.info("正在全量采集安徽省海关数据")
-        crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','all')
-        duration = time.time() - start_time
-        send_dingtalk_message(f'【安徽省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
-    else:
-        log.info("正在增量采集安徽省海关数据")
-        res = crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','auto')
-        if res == 'finish':
+    try:
+        parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+        parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
+        args = parser.parse_args()
+
+        start_time = time.time()
+        if args.year == 2023:
+            log.info("正在全量采集安徽省海关数据")
+            crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','all')
             duration = time.time() - start_time
-            send_dingtalk_message(f'【安徽省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【安徽省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
+        else:
+            log.info("正在增量采集安徽省海关数据")
+            res = crawl_with_selenium('http://hefei.customs.gov.cn/hefei_customs/zfxxgkzl59/3169584/479584/479585/index.html','auto')
+            if res == 'finish':
+                duration = time.time() - start_time
+                send_dingtalk_message(f'【安徽省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+    except Exception as e:
+        send_dingtalk_message(f'【安徽省海关】发生错误:{e}')
 
 if __name__ == '__main__':
     main()

+ 18 - 15
crossborder/hebei/crawl_gov_hebei_full.py

@@ -300,22 +300,25 @@ def hierarchical_traversal(root_path):
 
 
 def main():
-    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
-    parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
-    args = parser.parse_args()
-
-    start_time = time.time()
-    if args.year == 2023:
-        log.info("正在全量采集河北省海关数据")
-        crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html', 'all')
-        duration = time.time() - start_time
-        send_dingtalk_message(f'【河北省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
-    else:
-        log.info("正在增量采集河北省海关数据")
-        res = crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html','auto')
-        if res == 'finish':
+    try:
+        parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+        parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
+        args = parser.parse_args()
+
+        start_time = time.time()
+        if args.year == 2023:
+            log.info("正在全量采集河北省海关数据")
+            crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html', 'all')
             duration = time.time() - start_time
-            send_dingtalk_message(f'【河北省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【河北省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
+        else:
+            log.info("正在增量采集河北省海关数据")
+            res = crawl_with_selenium('http://shijiazhuang.customs.gov.cn/shijiazhuang_customs/zfxxgk43/2988665/2988681/index.html','auto')
+            if res == 'finish':
+                duration = time.time() - start_time
+                send_dingtalk_message(f'【河北省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+    except Exception as e:
+        send_dingtalk_message(f"【河北省海关】发生错误:{e}")
 
 if __name__ == '__main__':
     main()

+ 18 - 15
crossborder/jiangsu/crawl_gov_jiangsu_full.py

@@ -337,22 +337,25 @@ def hierarchical_traversal(root_path, all_records):
                 gov_commodity_jiangsu_city.process_folder(md['path'])
 
 def main():
-    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
-    parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
-    args = parser.parse_args()
-
-    start_time = time.time()
-    if args.year == 2023:
-        log.info("正在全量采集江苏省海关数据")
-        crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','all')
-        duration = time.time() - start_time
-        send_dingtalk_message(f'【江苏省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
-    else:
-        log.info("正在增量采集江苏省海关数据")
-        res = crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','auto')
-        if res == 'finish':
+    try:
+        parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+        parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
+        args = parser.parse_args()
+
+        start_time = time.time()
+        if args.year == 2023:
+            log.info("正在全量采集江苏省海关数据")
+            crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','all')
             duration = time.time() - start_time
-            send_dingtalk_message(f'【江苏省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【江苏省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
+        else:
+            log.info("正在增量采集江苏省海关数据")
+            res = crawl_with_selenium('http://nanjing.customs.gov.cn/nanjing_customs/zfxxgk58/fdzdgknr95/3010051/589289/7e2fcc72-1.html','auto')
+            if res == 'finish':
+                duration = time.time() - start_time
+                send_dingtalk_message(f'【江苏省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+    except Exception as e:
+        send_dingtalk_message(f"【江苏省海关】发生错误:{e}")
 
 if __name__ == '__main__':
     main()

+ 18 - 15
crossborder/zhejiang/crawl_gov_zhejiang_full.py

@@ -384,22 +384,25 @@ def hierarchical_traversal(root_path):
                 gov_commodity_zhejiang_city.process_folder(md['path'])
 
 def main():
-    parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
-    parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
-    args = parser.parse_args()
-
-    start_time = time.time()
-    if args.year == 2023:
-        log.info("正在全量采集浙江省海关数据")
-        crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
-        duration = time.time() - start_time
-        send_dingtalk_message(f'【浙江省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
-    else:
-        log.info("正在增量采集浙江省海关数据")
-        res = crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html','auto')
-        if res == 'finish':
+    try:
+        parser = argparse.ArgumentParser(description='海关数据智能抓取系统')
+        parser.add_argument('--year', type=int, default=None, help='终止年份(如2023),未指定时抓取最新两个月')
+        args = parser.parse_args()
+
+        start_time = time.time()
+        if args.year == 2023:
+            log.info("正在全量采集浙江省海关数据")
+            crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html', 'all')
             duration = time.time() - start_time
-            send_dingtalk_message(f'【浙江省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+            send_dingtalk_message(f'【浙江省海关】全量数据采集完成,耗时 {duration:.2f} 秒')
+        else:
+            log.info("正在增量采集浙江省海关数据")
+            res = crawl_with_selenium('http://hangzhou.customs.gov.cn/hangzhou_customs/575609/zlbd/575612/575612/6430241/6430315/index.html','auto')
+            if res == 'finish':
+                duration = time.time() - start_time
+                send_dingtalk_message(f'【浙江省海关】增量数据采集完成,耗时 {duration:.2f} 秒')
+    except Exception as e:
+        send_dingtalk_message(f"【浙江省海关】发生错误:{e}")
 
 if __name__ == '__main__':
     main()