yann 4 years ago
parent
commit
bf67829c19

+ 4 - 4
content/download_content.sh

@@ -1,9 +1,9 @@
 gsutil -m cp -r gs://covid19-public-dataset-aketari/pdf/* gs://$BUCKET_NAME/pdf/
-gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_lg-0.2.4.tar.gz ./content/scispacy_models/en_core_sci_lg-0.2.4.tar.gz
-gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_sm-0.2.4.tar.gz ./content/scispacy_models/en_core_sci_sm-0.2.4.tar.gz
-gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz ./content/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz
+#gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_lg-0.2.4.tar.gz ./content/scispacy_models/en_core_sci_lg-0.2.4.tar.gz
+#gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_sm-0.2.4.tar.gz ./content/scispacy_models/en_core_sci_sm-0.2.4.tar.gz
+#gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz ./content/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz
 
-# Installing all supported NER models
+# Installing all supported NER models from the local tarballs
 pip install -U ./content/scispacy_models/en_core_sci_sm-0.2.4.tar.gz
 pip install -U ./content/scispacy_models/en_core_sci_lg-0.2.4.tar.gz
 pip install -U ./content/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz

+ 6 - 6
content/env_variables.sh

@@ -1,9 +1,9 @@
-export SA_KEY_PATH="path/to/service_account.json"
-export PROJECT_ID="unique_project_id"
-export BUCKET_NAME="bucket_contains_data"
-export LOCATION="compute_region"
+export SA_KEY_PATH="content/wbss-codelab-6e07b21e6e12.json"
+export PROJECT_ID="wbss-codelab"
+export BUCKET_NAME="cyborg-ai-covid19-data"
+export LOCATION="europe-west1"
 export BQ_DATASET_NAME="covid19"
 export BQ_TABLE_NAME="ISMIR"
 export TEST_CASE="case14" # lowercase any case from 1 to 49 (e.g case1, case32 ...)
-export RESULT_TOPIC="topic_of_choice"
-export DEST_BUCKET="name_bucket" #trigger bucket must be different than data bucket
+export RESULT_TOPIC="covid19"
+export GOOGLE_APPLICATION_CREDENTIALS="content/wbss-codelab-6e07b21e6e12.json"
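
Note: these values are presumably consumed by the scripts via os.getenv. A minimal sketch of both credential paths, assuming the variable names above; the explicit-credentials form mirrors what extraction.py and preprocessing.py do, while GOOGLE_APPLICATION_CREDENTIALS is the standard variable the Google client libraries read on their own.

    import os
    from google.cloud import storage
    from google.oauth2 import service_account

    # Explicit service-account credentials, presumably built from SA_KEY_PATH
    credentials = service_account.Credentials.from_service_account_file(
        os.getenv('SA_KEY_PATH'))
    storage_client = storage.Client(project=os.getenv('PROJECT_ID'),
                                    credentials=credentials)

    # With GOOGLE_APPLICATION_CREDENTIALS exported, the client libraries find
    # the key file themselves, so the explicit credentials object is optional:
    # storage_client = storage.Client(project=os.getenv('PROJECT_ID'))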

+ 6 - 2
scripts/extraction.py

@@ -1,6 +1,6 @@
 from google.cloud import storage, vision
 from google.oauth2 import service_account
-from utils.preprocessing_fcn import asyncDetectDocument, readJsonResult, uploadBlob
+from utils.preprocessing_fcn import readJsonResult, uploadBlob, asyncDetectDocument
 
 import logging
 
@@ -27,6 +27,7 @@ lst_json_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                            prefix='json')
 
 start_time = time.time()
+operation_list = []
 for blob in lst_pdf_blobs:
     doc_title = blob.name.split('/')[-1].split('.pdf')[0]
 
@@ -35,9 +36,12 @@ for blob in lst_pdf_blobs:
     json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title + '-'
 
     # OCR pdf documents
-    asyncDetectDocument(vision_client,
+    operation = asyncDetectDocument(vision_client,
                         gcs_source_path,
                         json_gcs_dest_path)
+    operation_list.append(operation)
+for operation in operation_list:
+    operation.result()
 total_time = time.time() - start_time
 logging.info("Vision API successfully completed OCR of all documents on {} minutes".format(round(total_time / 60, 1)))
 
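The loop now fans out all Vision OCR requests and only afterwards waits on the collected long-running operations. A hedged sketch of that fan-in step with a timeout and per-operation error handling; the timeout value and the exception handling are assumptions, since the code above calls operation.result() with no arguments.

    import logging
    from google.api_core import exceptions

    # operation_list holds the LROs returned by asyncDetectDocument above.
    for operation in operation_list:
        try:
            # Block until this document's OCR finishes, or give up after 5 min.
            operation.result(timeout=300)
        except exceptions.GoogleAPICallError as err:
            logging.error("OCR operation failed: %s", err)
        except Exception as err:  # e.g. concurrent.futures.TimeoutError
            logging.error("OCR operation did not finish in time: %s", err)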

+ 20 - 5
scripts/preprocessing.py

@@ -1,6 +1,6 @@
 from google.cloud import storage, translate
 from google.oauth2 import service_account
-from utils.preprocessing_fcn import batch_translate_text, upload_blob
+from utils.preprocessing_fcn import batch_translate_text, uploadBlob
 import logging
 logging.getLogger().setLevel(logging.INFO)
 
@@ -21,6 +21,8 @@ translate_client = translate.TranslationServiceClient(credentials=credentials)
 
 lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                            prefix='raw_txt')
+lst_raw_txt_blobs_2 = storage_client.list_blobs(bucket_or_name=bucket_name,
+                                           prefix='raw_txt')
 
 customize_stop_words = [
     'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
@@ -35,6 +37,9 @@ customize_stop_words = [
 ]
 
 start_time = time.time()
+# parallelize the translation requests
+operation_list = []
+
 for blob in lst_raw_txt_blobs:
     doc_title = blob.name.split('/')[-1].split('.')[0]
 
@@ -44,19 +49,27 @@ for blob in lst_raw_txt_blobs:
 
     # Translate raw text to English
     try:
-        batch_translate_text(translate_client=translate_client,
+        operation = batch_translate_text(translate_client=translate_client,
                              project_id=project_id,
                              input_uri=txt_gcs_dest_path,
                              output_uri=eng_txt_gcs_dest_path)
-        logging.info("Translation of {} document was successful.".format(doc_title))
+        operation_list.append(operation)
+        logging.info("Translation of {} document was started.".format(doc_title))
     except Exception as e:
         logging.error("Error", e)
+for operation in operation_list:
+    operation.result(timeout=180)
+total_time = time.time() - start_time
+logging.info("Translation is done in {} minutes".format(
+    round(total_time / 60, 1)))
 
+for blob in lst_raw_txt_blobs_2:
+    doc_title = blob.name.split('/')[-1].split('.')[0]
+    processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'
     # Curate eng raw text
     blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
                                                                         bucket_name,
                                                                         doc_title)
-
     eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix)
     eng_raw_string = eng_blob.download_as_string().decode('utf-8')
 
@@ -82,10 +95,12 @@ for blob in lst_raw_txt_blobs:
         refined_doc += ' {}'.format(word)
 
     # Upload raw text to GCS
-    upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
+    uploadBlob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
                 destination_blob_name=processed_eng_gcs_dest_path)
     logging.info("The curation of {} text completed successfully.".format(doc_title))
 
+
+
 total_time = time.time() - start_time
 logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format(
     round(total_time / 60, 1)))
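
lst_raw_txt_blobs_2 exists because storage_client.list_blobs() returns a one-shot iterator, which the first loop exhausts. A small sketch of an alternative that lists once and reuses the result; this is an illustration of the pattern, not the code in this commit.

    import os
    from google.cloud import storage

    storage_client = storage.Client(project=os.getenv('PROJECT_ID'))
    bucket_name = os.getenv('BUCKET_NAME')

    # Materialize the one-shot iterator so the same listing can drive both
    # the translation pass and the curation pass.
    raw_txt_blobs = list(storage_client.list_blobs(bucket_or_name=bucket_name,
                                                   prefix='raw_txt'))

    for blob in raw_txt_blobs:
        pass  # pass 1: start one batch translation per document

    for blob in raw_txt_blobs:
        pass  # pass 2: curate the translated English text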

+ 2 - 3
scripts/storing.py

@@ -37,8 +37,7 @@ if args.store_datastore == 'True' and not args.model_name:
 elif args.store_datastore == 'True' and args.model_name not in model_choices:
     parser.error('--storing in datastore can only be done when --model_name is among the supported models: {}.'.format(model_choices))
 
-
-model_name = args['model_name']
+model_name = args.model_name
 project_id = os.getenv('PROJECT_ID')
 bucket_name = os.getenv('BUCKET_NAME')
 location = os.getenv('LOCATION')
@@ -68,7 +67,7 @@ else:
 if args.store_datastore == 'True':
     start_time = time.time()
     populateDatastore(datastore_client=datastore_client, storage_client=storage_client,
-                      bucket_name=bucket_name, model_name=model_name)
+                      src_bucket=bucket_name, model_name=model_name)
     total_time = time.time() - start_time
     logging.info(
         "The export to Datastore was completed successfully and took {} seconds.".format(round(total_time, 1)))

+ 5 - 4
scripts/utils/bq_fcn.py

@@ -163,9 +163,9 @@ def populateBQ(bq_client, storage_client, bucket_name, dataset_name, table_name)
     except Exception as e:
         logging.error("An error occurred.", e)
 
-    src_bucket = os.environ['SRC_BUCKET']
-    dest_bucket = os.environ['DEST_BUCKET']
-    gcs_source_prefix = 'pdf'
+    src_bucket = os.environ['BUCKET_NAME']
+    dest_bucket = src_bucket
+    gcs_source_prefix = 'raw_txt'
     lst_blobs = storage_client.list_blobs(bucket_or_name=src_bucket,
                                           prefix=gcs_source_prefix)
 
@@ -173,7 +173,8 @@ def populateBQ(bq_client, storage_client, bucket_name, dataset_name, table_name)
         doc_title = blob.name.split('/')[-1].split('.txt')[0]
 
         # download as string
-        it_raw_blob = storage_client.get_bucket(dest_bucket).get_blob('raw_txt/{}.txt'.format(doc_title))
+        current_bucket = storage_client.get_bucket(src_bucket)
+        it_raw_blob = current_bucket.get_blob('raw_txt/{}.txt'.format(doc_title))
 
         # set the GCS path
         try:
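
get_bucket() is now called on every loop iteration, and each call fetches bucket metadata from the API. A hedged sketch of hoisting the handle out of the loop using Client.bucket(), which builds the object without a request; storage_client, src_bucket and lst_blobs are the names from the diff, src_bucket_handle is hypothetical.

    # Build the bucket handle once; Client.bucket() does not call the API,
    # unlike get_bucket(), which issues a metadata GET per invocation.
    src_bucket_handle = storage_client.bucket(src_bucket)

    for blob in lst_blobs:
        doc_title = blob.name.split('/')[-1].split('.txt')[0]
        it_raw_blob = src_bucket_handle.get_blob('raw_txt/{}.txt'.format(doc_title))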

+ 5 - 21
scripts/utils/ner_fcn.py

@@ -3,22 +3,8 @@ from scispacy.umls_linking import UmlsEntityLinker
 import logging
 import pandas as pd
 import re
+import en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md
 
-def importModel(model_name):
-    """
-    Selective import of the required model from scispacy. These models are quite heavy, hence this function.
-    Args:
-        model_name: str -
-
-    Returns:
-
-    """
-    if model_name == 'en_core_sci_sm':
-        import en_core_sci_sm
-    elif model_name == 'en_core_sci_lg':
-        import en_core_sci_lg
-    elif model_name == 'en_ner_bc5cdr_md':
-        import en_ner_bc5cdr_md
 
 def loadModel(model):
     """
@@ -119,17 +105,14 @@ def populateDatastore(datastore_client, storage_client, model_name, src_bucket='
     """
     Extract UMLS entities and store them in a No-SQL db: Datastore.
     Args:
-        datastore_client: Storage client instantiation -
+        datastore_client: Datastore client instantiation -
         storage_client: Storage client instantiation -
         model_name: str -
         src_bucket: str - contains pdf of the newest files
     Returns:
         Queriable database
     """
-
-    lst_curated_blobs = storage_client.list_blobs(bucket_or_name=src_bucket)
-
-    importModel(model_name)
+    lst_curated_blobs = storage_client.list_blobs(bucket_or_name=src_bucket, prefix='curated_eng_txt')
 
     if model_name == 'en_core_sci_sm':
         nlp, linker = loadModel(model=en_core_sci_sm)
@@ -142,9 +125,10 @@ def populateDatastore(datastore_client, storage_client, model_name, src_bucket='
 
     for blob in lst_curated_blobs:
         doc_title = blob.name.split('/')[-1].split('.pdf')[0]
+        logging.info(doc_title)
 
         # download as string
-        eng_string = blob.download_as_string().decode('utf-8')
+        eng_string = blob.download_as_string().decode('utf-8', errors='ignore')
 
         # convert to vector
         doc = nlp(eng_string)
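
With importModel() removed, all three scispacy packages are imported at module load and selected via the if/elif chain. A hedged alternative sketch that keeps the selection in one dictionary; MODEL_REGISTRY is a hypothetical name, while loadModel and model_name are the existing names from this module.

    import en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md

    MODEL_REGISTRY = {
        'en_core_sci_sm': en_core_sci_sm,
        'en_core_sci_lg': en_core_sci_lg,
        'en_ner_bc5cdr_md': en_ner_bc5cdr_md,
    }

    # Equivalent to the if/elif chain in populateDatastore:
    nlp, linker = loadModel(model=MODEL_REGISTRY[model_name])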

+ 7 - 5
scripts/utils/preprocessing_fcn.py

@@ -4,7 +4,7 @@ import logging
 from google.protobuf import json_format
 
 
-def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri, batch_size=20):
+def asyncDetectDocument(vision_client, gcs_source_uri, gcs_destination_uri, batch_size=20):
     """
     OCR with PDF/TIFF as source files on GCS
     Args:
@@ -40,9 +40,10 @@ def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri, ba
     operation = vision_client.async_batch_annotate_files(
         requests=[async_request])
 
-    # print('Waiting for the operation to finish.')
-    operation.result(timeout=180)
-    logging.info('Text extraction from document {} is completed.'.format(doc_title))
+    # the caller is now responsible for waiting on the returned operation
+    logging.info('Text extraction from document {} has started.'.format(doc_title))
+    return operation
+
 
 
 def readJsonResult(storage_client, bucket_name, doc_title):
@@ -91,6 +92,7 @@ def uploadBlob(storage_client, bucket_name, txt_content, destination_blob_name):
     Returns:
 
     """
+    # strip the gs://<bucket>/ prefix so only the object name is passed to bucket.blob()
     destination_blob_name = destination_blob_name.split('gs://{}/'.format(bucket_name))[-1]
     bucket = storage_client.bucket(bucket_name)
     blob = bucket.blob(destination_blob_name)
@@ -136,4 +138,4 @@ def batch_translate_text(translate_client, project_id,
         input_configs=[input_configs_element],
         output_config=output_config)
 
-    response = operation.result(180)
+    return operation