5 years ago · 530e94e65d
--- a/scripts/preprocessing.py
+++ b/scripts/preprocessing.py
@@ -2,6 +2,7 @@ from google.cloud import storage, translate
 
															 from google.oauth2 import service_account
														
 
															 from utils.preprocessing_fcn import batch_translate_text, upload_blob
														
 
															 import logging
														
 
															+logging.getLogger().setLevel(logging.INFO)
														
 
															 import re
														
 
															 import time
														
@@ -40,16 +41,15 @@ for blob in lst_raw_txt_blobs:
 
															     eng_txt_gcs_dest_path = 'gs://' + bucket_name + '/eng_txt/{}/'.format(doc_title)
														
 
															     processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'
														
 
															-    # Translate raw text to english
														
 
															-    try:
														
 
															-        batch_translate_text(translate_client=translate_client,
														
 
															-                             project_id=project_id,
														
 
															-                             location=location,
														
 
															-                             input_uri=txt_gcs_dest_path,
														
 
															-                             output_uri=eng_txt_gcs_dest_path)
														
 
															-        logging.info("Translation of {} document was successful.".format(doc_title))
														
 
															-    except Exception as e:
														
 
															-        logging.error("Error", e)
														
 
															+    # Translate raw text to English
														
 
															+    #try:
														
 
															+    batch_translate_text(translate_client=translate_client,
														
 
															+                         project_id=project_id,
														
 
															+                         input_uri=txt_gcs_dest_path,
														
 
															+                         output_uri=eng_txt_gcs_dest_path)
														
 
															+    logging.info("Translation of {} document was successful.".format(doc_title))
														
 
															+    # except Exception as e:
														
 
															+    #     logging.error("Error", e)
														
 
															     # Process eng raw text
														
 
															     blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
														
@@ -81,7 +81,8 @@ for blob in lst_raw_txt_blobs:
 
															         refined_doc += ' {}'.format(word)
														
 
															     # Upload raw text to GCS
														
 
															-    upload_blob(refined_doc, processed_eng_gcs_dest_path)
														
 
															+    upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
														
 
															+                destination_blob_name=processed_eng_gcs_dest_path)
														
 
															     logging.info("The curation of {} text completed successfully.".format(doc_title))
														
 
															 total_time = time.time() - start_time
														
--- a/scripts/retrieving.py
+++ b/scripts/retrieving.py
@@ -4,6 +4,7 @@ from utils.bq_fcn import returnQueryResults
 
															 from utils.ner_fcn import getCases
														
 
															 import logging
														
 
															+logging.getLogger().setLevel(logging.INFO)
														
 
															 import os
														
 
															 project_id = os.getenv('PROJECT_ID')
														
--- a/scripts/storing.py
+++ b/scripts/storing.py
@@ -2,9 +2,25 @@ from google.cloud import storage, bigquery, datastore
 
															 from google.oauth2 import service_account
														
 
															 from utils.bq_fcn import bqCreateDataset, bqCreateTable, exportItems2BQ
														
 
															 from utils.ner_fcn import loadModel, addTask, extractMedEntities
														
 
															-import en_core_sci_lg
														
 
															 import logging
														
 
															+logging.getLogger().setLevel(logging.INFO)
														
 
															+
														
 
															+try:
														
 
															+    import en_core_sci_sm
														
 
															+except:
														
 
															+    logging.warning("404: en_core_sci_sm NOT FOUND. Make sure the model was downloaded and installed.")
														
 
															+
														
 
															+try:
														
 
															+    import en_core_sci_lg
														
 
															+except:
														
 
															+    logging.warning("404: en_core_sci_lg NOT FOUND. Make sure the model was downloaded and installed.")
														
 
															+try:
														
 
															+    import en_ner_bionlp13cg_md
														
 
															+except:
														
 
															+    logging.warning("404: en_ner_bionlp13cg_md NOT FOUND. Make sure the model was downloaded and installed.")
														
 
															+
														
 
															+
														
 
															 import time
														
 
															 import os
														
 
															 import pandas as pd
														
@@ -22,6 +38,8 @@ storage_client = storage.Client(credentials=credentials)
 
															 datastore_client = datastore.Client(credentials=credentials)
														
 
															+bq_client = bigquery.Client(credentials=credentials)
														
 
															+
														
 
															 gcs_source_prefix = 'raw_txt'
														
 
															 lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
														
 
															                                       prefix=gcs_source_prefix)
														
@@ -29,13 +47,13 @@ lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
 
															 start_time = time.time()
														
 
															 try:
														
 
															-    dataset_id = bqCreateDataset(dataset_name)
														
 
															+    dataset_id = bqCreateDataset(bq_client, dataset_name)
														
 
															     logging.info("The following dataset {} was successfully created/retrieved.".format(dataset_name))
														
 
															 except Exception as e:
														
 
															     logging.error("An error occurred.", e)
														
 
															 try:
														
 
															-    table_id = bqCreateTable(dataset_id, table_name)
														
 
															+    table_id = bqCreateTable(bq_client, dataset_id, table_name)
														
 
															     logging.info("The following table {} was successfully created/retrieved.".format(table_name))
														
 
															 except Exception as e:
														
 
															     logging.error("An error occurred.", e)
														
@@ -47,9 +65,7 @@ for blob in lst_blobs:
 
															     it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title))
														
 
															     # set the GCS path
														
 
															-    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
														
 
															-                                                                              bucket_name,
														
 
															-                                                                              doc_title)
														
 
															+    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, bucket_name, doc_title)
														
 
															     eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw)
														
 
															     # Upload blob of interest
														
@@ -57,7 +73,7 @@ for blob in lst_blobs:
 
															         .get_blob('curated_eng_txt/{}.txt'.format(doc_title))
														
 
															     # populate to BQ dataset
														
 
															-    exportItems2BQ(dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
														
 
															+    exportItems2BQ(bq_client, dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
														
 
															 total_time = time.time() - start_time
														
 
															 logging.info('The export to BigQuery was completed successfully and took {} minutes.'.format(round(total_time / 60, 1)))
														
@@ -66,7 +82,7 @@ curated_gcs_source_prefix = 'curated_eng_txt'
 
															 lst_curated_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
														
 
															                                               prefix=curated_gcs_source_prefix)
														
 
															-nlp = loadModel(model=en_core_sci_lg)
														
 
															+nlp = loadModel(model=en_core_sci_sm)
														
 
															 start_time = time.time()
														
 
															 for blob in lst_curated_blobs:
														
--- a/scripts/utils/bq_fcn.py
+++ b/scripts/utils/bq_fcn.py
@@ -1,20 +1,8 @@
 
															 from google.cloud import bigquery
														
 
															-from google.oauth2 import service_account
														
 
															 import logging
														
 
															-import os
														
 
															-# project_id = os.getenv('PROJECT_ID')
														
 
															-# bucket_name = os.getenv('BUCKET_NAME')
														
 
															-# location = os.getenv('LOCATION')
														
 
															-# key_path = os.getenv('SA_KEY_PATH')
														
 
															-#
														
 
															-# credentials = service_account.Credentials.from_service_account_file(key_path)
														
 
															-#
														
 
															-# bq_client = bigquery.Client(credentials=credentials,
														
 
															-#                             project_id=project_id)
														
 
															-
														
 
															-def bqCreateDataset(dataset_name):
														
 
															+def bqCreateDataset(bq_client, dataset_name):
														
 
															     """
														
 
															     Creates a dataset on Google Cloud Platform.
														
 
															     Args:
														
@@ -35,7 +23,7 @@ def bqCreateDataset(dataset_name):
 
															         return dataset.dataset_id
														
 
															-def bqCreateTable(dataset_id, table_name):
														
 
															+def bqCreateTable(bq_client, dataset_id, table_name):
														
 
															     """
														
 
															     Create main table with all cases and the medical text.
														
 
															     Args:
														
@@ -65,7 +53,7 @@ def bqCreateTable(dataset_id, table_name):
 
															         return table.table_id
														
 
															-def exportItems2BQ(dataset_id, table_id, case, it_raw_blob, eng_raw_blob, curated_eng_blob):
														
 
															+def exportItems2BQ(bq_client, dataset_id, table_id, case, it_raw_blob, eng_raw_blob, curated_eng_blob):
														
 
															     """
														
 
															     Export text data to BigQuery.
														
 
															     Args:
														
--- a/scripts/utils/ner_fcn.py
+++ b/scripts/utils/ner_fcn.py
@@ -1,27 +1,13 @@
 
															 from google.cloud import datastore
														
 
															-from google.oauth2 import service_account
														
 
															+
														
 
															 import logging
														
 
															 import re
														
 
															-import os
														
 
															-import en_core_sci_sm, en_core_sci_lg, en_ner_bionlp13cg_md
														
 
															 from scispacy.umls_linking import UmlsEntityLinker
														
 
															 from scispacy.abbreviation import AbbreviationDetector
														
 
															-# DEVELOPER: change path to key
														
 
															-# project_id = os.getenv('PROJECT_ID')
														
 
															-# bucket_name = os.getenv('BUCKET_NAME')
														
 
															-# location = os.getenv('LOCATION')
														
 
															-# key_path = os.getenv('SA_KEY_PATH')
														
 
															-
														
 
															-# credentials = service_account.Credentials.from_service_account_file(key_path)
														
 
															-#
														
 
															-# datastore_client = datastore.Client(credentials=credentials,
														
 
															-#                                     project_id=credentials.project_id)
														
 
															-
														
 
															-
														
 
															-def loadModel(model=en_core_sci_lg):
														
 
															+def loadModel(model):
														
 
															     """
														
 
															     Loading Named Entity Recognition model.
														
 
															     Args:
														
@@ -77,18 +63,18 @@ def extractMedEntities(vectorized_doc):
 
															     return UMLS_tuis_entity
														
 
															-def addTask(client, doc_title, entities_dict):
														
 
															+def addTask(datastore_client, doc_title, entities_dict):
														
 
															     """
														
 
															     Upload entities to Datastore.
														
 
															     Args:
														
 
															-        client:
														
 
															+        datastore_client:
														
 
															         doc_title:
														
 
															         entities_dict:
														
 
															     Returns:
														
 
															         Datastore key object.
														
 
															     """
														
 
															-    key = client.key('case', doc_title)
														
 
															+    key = datastore_client.key('case', doc_title)
														
 
															     task = datastore.Entity(key=key)
														
 
															     task.update(
														
 
															         entities_dict
														
--- a/scripts/utils/preprocessing_fcn.py
+++ b/scripts/utils/preprocessing_fcn.py
@@ -1,27 +1,8 @@
 
															 from google.cloud import storage, translate, vision
														
 
															-#from google.oauth2 import service_account
														
 
															 import logging
														
 
															 from google.protobuf import json_format
														
 
															-# DEVELOPER: change path to key
														
 
															-# project_id = os.getenv('PROJECT_ID')
														
 
															-# bucket_name = os.getenv('BUCKET_NAME')
														
 
															-# location = os.getenv('LOCATION')
														
 
															-# key_path = os.getenv('SA_KEY_PATH')
														
 
															-
														
 
															-# DEVELOPER: change path to key
														
 
															-# credentials = service_account.Credentials.from_service_account_file(key_path)
														
 
															-#
														
 
															-# storage_client = storage.Client(credentials=credentials,
														
 
															-#                                 project_id=credentials.project_id)
														
 
															-#
														
 
															-# translate_client = translate.Client(credentials=credentials,
														
 
															-#                                     project_id=credentials.project_id)
														
 
															-#
														
 
															-# vision_client = vision.Client(credentials=credentials,
														
 
															-#                               project_id=credentials.project_id)
														
 
															-
														
 
															 def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri, batch_size=20):
														
 
															     """
														
@@ -119,7 +100,7 @@ def upload_blob(storage_client, bucket_name, txt_content, destination_blob_name)
 
															     logging.info("Text uploaded to {}".format(destination_blob_name))
														
 
															-def batch_translate_text(translate_client, project_id, location,
														
 
															+def batch_translate_text(translate_client, project_id,
														
 
															                          input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
														
 
															                          output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/"):
														
 
															     """
														
@@ -127,7 +108,6 @@ def batch_translate_text(translate_client, project_id, location,
 
															     Args:
														
 
															         translate_client
														
 
															         project_id:
														
 
															-        location:
														
 
															         input_uri:
														
 
															         output_uri:
														
@@ -144,7 +124,9 @@ def batch_translate_text(translate_client, project_id, location,
 
															     }
														
 
															     gcs_destination = {"output_uri_prefix": output_uri}
														
 
															     output_config = {"gcs_destination": gcs_destination}
														
 
															-    parent = translate_client.location_path(project_id, location)
														
 
															+
														
 
															+    # Only us-central1 or global are supported locations
														
 
															+    parent = translate_client.location_path(project_id, location="us-central1")
														
 
															     # Supported language codes: https://cloud.google.com/translate/docs/language
														
 
															     operation = translate_client.batch_translate_text(