@@ -1,6 +1,6 @@
from google.cloud import storage, translate
from google.oauth2 import service_account
-from utils.preprocessing_fcn import batch_translate_text, upload_blob
+from utils.preprocessing_fcn import batch_translate_text, upload_blob
import logging
logging.getLogger().setLevel(logging.INFO)
@@ -21,6 +21,8 @@ translate_client = translate.TranslationServiceClient(credentials=credentials)
lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
prefix='raw_txt')
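+# list_blobs() returns a single-use iterator, so the prefix is listed a second time for the curation pass further below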
+lst_raw_txt_blobs_2 = storage_client.list_blobs(bucket_or_name=bucket_name,
+ prefix='raw_txt')
customize_stop_words = [
'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
@@ -35,6 +37,9 @@ customize_stop_words = [
]
start_time = time.time()
+# parallelize translation
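+# batch_translate_text is expected to return a long-running operation; collect them so they can all be awaited after the loop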
+operation_list = []
+
for blob in lst_raw_txt_blobs:
doc_title = blob.name.split('/')[-1].split('.')[0]
@@ -44,19 +49,27 @@ for blob in lst_raw_txt_blobs:
    # Translate raw text to English
try:
- batch_translate_text(translate_client=translate_client,
+ operation = batch_translate_text(translate_client=translate_client,
project_id=project_id,
input_uri=txt_gcs_dest_path,
output_uri=eng_txt_gcs_dest_path)
- logging.info("Translation of {} document was successful.".format(doc_title))
+ operation_list.append(operation)
+        logging.info("Started translation of document {}.".format(doc_title))
except Exception as e:
logging.error("Error", e)
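+# block until every translation operation started above has finished
+# (operation.result raises if an operation exceeds its 180 s timeout)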
+for operation in operation_list:
+ operation.result(timeout=180)
+total_time = time.time() - start_time
+logging.info("Translation is done in {} minutes".format(
+ round(total_time / 60, 1)))
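+# second pass over the raw text blobs: strip the custom stop words from the translated English text and upload the curated result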
+for blob in lst_raw_txt_blobs_2:
+ doc_title = blob.name.split('/')[-1].split('.')[0]
+ processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'
# Curate eng raw text
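+    # reconstruct the object name of the English text written by the batch translation job under eng_txt/<doc_title>/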
blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
bucket_name,
doc_title)
-
eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix)
eng_raw_string = eng_blob.download_as_string().decode('utf-8')
@@ -82,10 +95,12 @@ for blob in lst_raw_txt_blobs:
refined_doc += ' {}'.format(word)
# Upload raw text to GCS
- upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
+    upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
destination_blob_name=processed_eng_gcs_dest_path)
logging.info("The curation of {} text completed successfully.".format(doc_title))
+
+
total_time = time.time() - start_time
logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format(
round(total_time / 60, 1)))