Aziz Ketari 4 years ago
parent
commit
530e94e65d

+ 12 - 11
scripts/preprocessing.py

@@ -2,6 +2,7 @@ from google.cloud import storage, translate
 from google.oauth2 import service_account
 from utils.preprocessing_fcn import batch_translate_text, upload_blob
 import logging
+logging.getLogger().setLevel(logging.INFO)
 
 import re
 import time
@@ -40,16 +41,15 @@ for blob in lst_raw_txt_blobs:
     eng_txt_gcs_dest_path = 'gs://' + bucket_name + '/eng_txt/{}/'.format(doc_title)
     processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'
 
-    # Translate raw text to english
-    try:
-        batch_translate_text(translate_client=translate_client,
-                             project_id=project_id,
-                             location=location,
-                             input_uri=txt_gcs_dest_path,
-                             output_uri=eng_txt_gcs_dest_path)
-        logging.info("Translation of {} document was successful.".format(doc_title))
-    except Exception as e:
-        logging.error("Error", e)
+    # Translate raw text to English
+    #try:
+    batch_translate_text(translate_client=translate_client,
+                         project_id=project_id,
+                         input_uri=txt_gcs_dest_path,
+                         output_uri=eng_txt_gcs_dest_path)
+    logging.info("Translation of {} document was successful.".format(doc_title))
+    # except Exception as e:
+    #     logging.error("Error", e)
 
     # Process eng raw text
     blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
@@ -81,7 +81,8 @@ for blob in lst_raw_txt_blobs:
         refined_doc += ' {}'.format(word)
 
     # Upload raw text to GCS
-    upload_blob(refined_doc, processed_eng_gcs_dest_path)
+    upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
+                destination_blob_name=processed_eng_gcs_dest_path)
     logging.info("The curation of {} text completed successfully.".format(doc_title))
 
 total_time = time.time() - start_time

+ 1 - 0
scripts/retrieving.py

@@ -4,6 +4,7 @@ from utils.bq_fcn import returnQueryResults
 from utils.ner_fcn import getCases
 
 import logging
+logging.getLogger().setLevel(logging.INFO)
 import os
 
 project_id = os.getenv('PROJECT_ID')

+ 24 - 8
scripts/storing.py

@@ -2,9 +2,25 @@ from google.cloud import storage, bigquery, datastore
 from google.oauth2 import service_account
 from utils.bq_fcn import bqCreateDataset, bqCreateTable, exportItems2BQ
 from utils.ner_fcn import loadModel, addTask, extractMedEntities
-import en_core_sci_lg
 
 import logging
+logging.getLogger().setLevel(logging.INFO)
+
+try:
+    import en_core_sci_sm
+except ImportError:
+    logging.warning("404: en_core_sci_sm NOT FOUND. Make sure the model was downloaded and installed.")
+
+try:
+    import en_core_sci_lg
+except ImportError:
+    logging.warning("404: en_core_sci_lg NOT FOUND. Make sure the model was downloaded and installed.")
+
+try:
+    import en_ner_bionlp13cg_md
+except ImportError:
+    logging.warning("404: en_ner_bionlp13cg_md NOT FOUND. Make sure the model was downloaded and installed.")
+
+
 import time
 import os
 import pandas as pd
@@ -22,6 +38,8 @@ storage_client = storage.Client(credentials=credentials)
 
 datastore_client = datastore.Client(credentials=credentials)
 
+bq_client = bigquery.Client(credentials=credentials)
+
 gcs_source_prefix = 'raw_txt'
 lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                       prefix=gcs_source_prefix)
@@ -29,13 +47,13 @@ lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
 start_time = time.time()
 
 try:
-    dataset_id = bqCreateDataset(dataset_name)
+    dataset_id = bqCreateDataset(bq_client, dataset_name)
     logging.info("The following dataset {} was successfully created/retrieved.".format(dataset_name))
 except Exception as e:
     logging.error("An error occurred.", e)
 
 try:
-    table_id = bqCreateTable(dataset_id, table_name)
+    table_id = bqCreateTable(bq_client, dataset_id, table_name)
     logging.info("The following table {} was successfully created/retrieved.".format(table_name))
 except Exception as e:
     logging.error("An error occurred.", e)
@@ -47,9 +65,7 @@ for blob in lst_blobs:
     it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title))
 
     # set the GCS path
-    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
-                                                                              bucket_name,
-                                                                              doc_title)
+    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, bucket_name, doc_title)
     eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw)
 
     # Upload blob of interest
@@ -57,7 +73,7 @@ for blob in lst_blobs:
         .get_blob('curated_eng_txt/{}.txt'.format(doc_title))
 
     # populate to BQ dataset
-    exportItems2BQ(dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
+    exportItems2BQ(bq_client, dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
 
 total_time = time.time() - start_time
 logging.info('The export to BigQuery was completed successfully and took {} minutes.'.format(round(total_time / 60, 1)))
@@ -66,7 +82,7 @@ curated_gcs_source_prefix = 'curated_eng_txt'
 lst_curated_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                               prefix=curated_gcs_source_prefix)
 
-nlp = loadModel(model=en_core_sci_lg)
+nlp = loadModel(model=en_core_sci_sm)
 
 start_time = time.time()
 for blob in lst_curated_blobs:

+ 3 - 15
scripts/utils/bq_fcn.py

@@ -1,20 +1,8 @@
 from google.cloud import bigquery
-from google.oauth2 import service_account
 import logging
-import os
 
-# project_id = os.getenv('PROJECT_ID')
-# bucket_name = os.getenv('BUCKET_NAME')
-# location = os.getenv('LOCATION')
-# key_path = os.getenv('SA_KEY_PATH')
-#
-# credentials = service_account.Credentials.from_service_account_file(key_path)
-#
-# bq_client = bigquery.Client(credentials=credentials,
-#                             project_id=project_id)
 
-
-def bqCreateDataset(dataset_name):
+def bqCreateDataset(bq_client, dataset_name):
     """
     Creates a dataset on Google Cloud Platform.
     Args:
@@ -35,7 +23,7 @@ def bqCreateDataset(dataset_name):
         return dataset.dataset_id
 
 
-def bqCreateTable(dataset_id, table_name):
+def bqCreateTable(bq_client, dataset_id, table_name):
     """
     Create main table with all cases and the medical text.
     Args:
@@ -65,7 +53,7 @@ def bqCreateTable(dataset_id, table_name):
         return table.table_id
 
 
-def exportItems2BQ(dataset_id, table_id, case, it_raw_blob, eng_raw_blob, curated_eng_blob):
+def exportItems2BQ(bq_client, dataset_id, table_id, case, it_raw_blob, eng_raw_blob, curated_eng_blob):
     """
     Export text data to BigQuery.
     Args:

+ 5 - 19
scripts/utils/ner_fcn.py

@@ -1,27 +1,13 @@
 from google.cloud import datastore
-from google.oauth2 import service_account
+
 import logging
 import re
-import os
 
-import en_core_sci_sm, en_core_sci_lg, en_ner_bionlp13cg_md
 from scispacy.umls_linking import UmlsEntityLinker
 from scispacy.abbreviation import AbbreviationDetector
 
 
-# DEVELOPER: change path to key
-# project_id = os.getenv('PROJECT_ID')
-# bucket_name = os.getenv('BUCKET_NAME')
-# location = os.getenv('LOCATION')
-# key_path = os.getenv('SA_KEY_PATH')
-
-# credentials = service_account.Credentials.from_service_account_file(key_path)
-#
-# datastore_client = datastore.Client(credentials=credentials,
-#                                     project_id=credentials.project_id)
-
-
-def loadModel(model=en_core_sci_lg):
+def loadModel(model):
     """
     Loading Named Entity Recognition model.
     Args:
@@ -77,18 +63,18 @@ def extractMedEntities(vectorized_doc):
     return UMLS_tuis_entity
 
 
-def addTask(client, doc_title, entities_dict):
+def addTask(datastore_client, doc_title, entities_dict):
     """
     Upload entities to Datastore.
     Args:
-        client:
+        datastore_client:
         doc_title:
         entities_dict:
 
     Returns:
         Datastore key object.
     """
-    key = client.key('case', doc_title)
+    key = datastore_client.key('case', doc_title)
     task = datastore.Entity(key=key)
     task.update(
         entities_dict

+ 4 - 22
scripts/utils/preprocessing_fcn.py

@@ -1,27 +1,8 @@
 from google.cloud import storage, translate, vision
-#from google.oauth2 import service_account
 import logging
 
 from google.protobuf import json_format
 
-# DEVELOPER: change path to key
-# project_id = os.getenv('PROJECT_ID')
-# bucket_name = os.getenv('BUCKET_NAME')
-# location = os.getenv('LOCATION')
-# key_path = os.getenv('SA_KEY_PATH')
-
-# DEVELOPER: change path to key
-# credentials = service_account.Credentials.from_service_account_file(key_path)
-#
-# storage_client = storage.Client(credentials=credentials,
-#                                 project_id=credentials.project_id)
-#
-# translate_client = translate.Client(credentials=credentials,
-#                                     project_id=credentials.project_id)
-#
-# vision_client = vision.Client(credentials=credentials,
-#                               project_id=credentials.project_id)
-
 
 def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri, batch_size=20):
     """
@@ -119,7 +100,7 @@ def upload_blob(storage_client, bucket_name, txt_content, destination_blob_name)
     logging.info("Text uploaded to {}".format(destination_blob_name))
 
 
-def batch_translate_text(translate_client, project_id, location,
+def batch_translate_text(translate_client, project_id,
                          input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
                          output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/"):
     """
@@ -127,7 +108,6 @@ def batch_translate_text(translate_client, project_id, location,
     Args:
         translate_client
         project_id:
-        location:
         input_uri:
         output_uri:
 
@@ -144,7 +124,9 @@ def batch_translate_text(translate_client, project_id, location,
     }
     gcs_destination = {"output_uri_prefix": output_uri}
     output_config = {"gcs_destination": gcs_destination}
-    parent = translate_client.location_path(project_id, location)
+
+    # Only the us-central1 and global locations are supported
+    parent = translate_client.location_path(project_id, location="us-central1")
 
     # Supported language codes: https://cloud.google.com/translate/docs/language
     operation = translate_client.batch_translate_text(