| 
														
															@@ -2,9 +2,25 @@ from google.cloud import storage, bigquery, datastore 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from google.oauth2 import service_account 
														 | 
														
														 | 
														
															 from google.oauth2 import service_account 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from utils.bq_fcn import bqCreateDataset, bqCreateTable, exportItems2BQ 
														 | 
														
														 | 
														
															 from utils.bq_fcn import bqCreateDataset, bqCreateTable, exportItems2BQ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from utils.ner_fcn import loadModel, addTask, extractMedEntities 
														 | 
														
														 | 
														
															 from utils.ner_fcn import loadModel, addTask, extractMedEntities 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-import en_core_sci_lg 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import logging 
														 | 
														
														 | 
														
															 import logging 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+logging.getLogger().setLevel(logging.INFO) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    import en_core_sci_sm 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+except: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    logging.warning("404: en_core_sci_sm NOT FOUND. Make sure the model was downloaded and installed.") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    import en_core_sci_lg 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+except: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    logging.warning("404: en_core_sci_lg NOT FOUND. Make sure the model was downloaded and installed.") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    import en_ner_bionlp13cg_md 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+except: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    logging.warning("404: en_ner_bionlp13cg_md NOT FOUND. Make sure the model was downloaded and installed.") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import time 
														 | 
														
														 | 
														
															 import time 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import os 
														 | 
														
														 | 
														
															 import os 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import pandas as pd 
														 | 
														
														 | 
														
															 import pandas as pd 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -22,6 +38,8 @@ storage_client = storage.Client(credentials=credentials) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 datastore_client = datastore.Client(credentials=credentials) 
														 | 
														
														 | 
														
															 datastore_client = datastore.Client(credentials=credentials) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+bq_client = bigquery.Client(credentials=credentials) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 gcs_source_prefix = 'raw_txt' 
														 | 
														
														 | 
														
															 gcs_source_prefix = 'raw_txt' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
														
														 | 
														
															 lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                                       prefix=gcs_source_prefix) 
														 | 
														
														 | 
														
															                                       prefix=gcs_source_prefix) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -29,13 +47,13 @@ lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 start_time = time.time() 
														 | 
														
														 | 
														
															 start_time = time.time() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 try: 
														 | 
														
														 | 
														
															 try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    dataset_id = bqCreateDataset(dataset_name) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    dataset_id = bqCreateDataset(bq_client, dataset_name) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     logging.info("The following dataset {} was successfully created/retrieved.".format(dataset_name)) 
														 | 
														
														 | 
														
															     logging.info("The following dataset {} was successfully created/retrieved.".format(dataset_name)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 except Exception as e: 
														 | 
														
														 | 
														
															 except Exception as e: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     logging.error("An error occurred.", e) 
														 | 
														
														 | 
														
															     logging.error("An error occurred.", e) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 try: 
														 | 
														
														 | 
														
															 try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    table_id = bqCreateTable(dataset_id, table_name) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    table_id = bqCreateTable(bq_client, dataset_id, table_name) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     logging.info("The following table {} was successfully created/retrieved.".format(table_name)) 
														 | 
														
														 | 
														
															     logging.info("The following table {} was successfully created/retrieved.".format(table_name)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 except Exception as e: 
														 | 
														
														 | 
														
															 except Exception as e: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     logging.error("An error occurred.", e) 
														 | 
														
														 | 
														
															     logging.error("An error occurred.", e) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -47,9 +65,7 @@ for blob in lst_blobs: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title)) 
														 | 
														
														 | 
														
															     it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # set the GCS path 
														 | 
														
														 | 
														
															     # set the GCS path 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                                                                              bucket_name, 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                                                                              doc_title) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, bucket_name, doc_title) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw) 
														 | 
														
														 | 
														
															     eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # Upload blob of interest 
														 | 
														
														 | 
														
															     # Upload blob of interest 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -57,7 +73,7 @@ for blob in lst_blobs: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         .get_blob('curated_eng_txt/{}.txt'.format(doc_title)) 
														 | 
														
														 | 
														
															         .get_blob('curated_eng_txt/{}.txt'.format(doc_title)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # populate to BQ dataset 
														 | 
														
														 | 
														
															     # populate to BQ dataset 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    exportItems2BQ(dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    exportItems2BQ(bq_client, dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 total_time = time.time() - start_time 
														 | 
														
														 | 
														
															 total_time = time.time() - start_time 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 logging.info('The export to BigQuery was completed successfully and took {} minutes.'.format(round(total_time / 60, 1))) 
														 | 
														
														 | 
														
															 logging.info('The export to BigQuery was completed successfully and took {} minutes.'.format(round(total_time / 60, 1))) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -66,7 +82,7 @@ curated_gcs_source_prefix = 'curated_eng_txt' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 lst_curated_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
														
														 | 
														
															 lst_curated_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                                               prefix=curated_gcs_source_prefix) 
														 | 
														
														 | 
														
															                                               prefix=curated_gcs_source_prefix) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-nlp = loadModel(model=en_core_sci_lg) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+nlp = loadModel(model=en_core_sci_sm) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 start_time = time.time() 
														 | 
														
														 | 
														
															 start_time = time.time() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 for blob in lst_curated_blobs: 
														 | 
														
														 | 
														
															 for blob in lst_curated_blobs: 
														 |