Aziz Ketari 4 роки тому
батько
коміт
42efebd32e

+ 7 - 6
scripts/preprocessing.py

@@ -18,8 +18,8 @@ storage_client = storage.Client(credentials=credentials)
 
 translate_client = translate.TranslationServiceClient(credentials=credentials)
 
-lst_json_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
-                                           prefix='json')
+lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
+                                           prefix='raw_txt')
 
 customize_stop_words = [
     'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
@@ -33,8 +33,8 @@ customize_stop_words = [
 ]
 
 start_time = time.time()
-for blob in lst_json_blobs:
-    doc_title = blob.name.split('/')[-1].split('-')[0]
+for blob in lst_raw_txt_blobs:
+    doc_title = blob.name.split('/')[-1].split('.')[0]
 
     txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'
     eng_txt_gcs_dest_path = 'gs://' + bucket_name + '/eng_txt/{}/'.format(doc_title)
@@ -42,12 +42,13 @@ for blob in lst_json_blobs:
 
     # Translate raw text to english
     try:
-        batch_translate_text(project_id=project_id,
+        batch_translate_text(translate_client=translate_client,
+                             project_id=project_id,
                              location=location,
                              input_uri=txt_gcs_dest_path,
                              output_uri=eng_txt_gcs_dest_path)
         logging.info("Translation of {} document was successful.".format(doc_title))
-    except Exception, e:
+    except Exception as e:
         logging.error("Error", e)
 
     # Process eng raw text

+ 2 - 2
scripts/retrieving.py

@@ -24,14 +24,14 @@ datastore_client = datastore.Client(credentials=credentials)
 try:
     results_lst = returnQueryResults(bq_client, project_id, dataset_name, table_name, case_id)
     logging.info("Here is the result of the test query: \n {}".format(results_lst))
-except Exception, e:
+except Exception as e:
     logging.error("Error", e)
 
 try:
     filter_dict = {'Sign or Symptom':['onset symptoms', "chills"]}
     results = getCases(datastore_client, filter_dict, limit=10)
     logging.info("Here is the result of the test query: \n {}".format(results))
-except Exception, e:
+except Exception as e:
     logging.error("Error", e)
 
 

+ 1 - 1
scripts/utils/bq_fcn.py

@@ -123,6 +123,6 @@ def returnQueryResults(bq_client, project_id, dataset_id, table_id, case_id):
         is_exist = len(list(query_job.result())) >= 1
         logging.info('Query case id: {}'.format(case_id) if is_exist \
                          else "Case id: {} does NOT exist".format(case_id))
-        print (list(query_job.result()))
+        logging.info(list(query_job.result()))
     except Exception as e:
         logging.error("Error", e)

+ 6 - 4
scripts/utils/preprocessing_fcn.py

@@ -1,7 +1,6 @@
 from google.cloud import storage, translate, vision
 #from google.oauth2 import service_account
 import logging
-import os
 
 from google.protobuf import json_format
 
@@ -65,10 +64,11 @@ def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri, ba
     logging.info('Text extraction from document {} is completed.'.format(doc_title))
 
 
-def read_json_result(bucket_name, doc_title, storage_client):
+def read_json_result(storage_client, bucket_name, doc_title):
     """
     Parsing the json files and extract text.
     Args:
+        storage_client:
         bucket_name:
         doc_title:
 
@@ -102,6 +102,7 @@ def upload_blob(storage_client, bucket_name, txt_content, destination_blob_name)
     """
     Uploads a file to the bucket.
     Args:
+        storage_client:
         bucket_name:
         txt_content:
         destination_blob_name:
@@ -115,15 +116,16 @@ def upload_blob(storage_client, bucket_name, txt_content, destination_blob_name)
 
     blob.upload_from_string(txt_content)
 
-    print("Text uploaded to {}".format(destination_blob_name))
+    logging.info("Text uploaded to {}".format(destination_blob_name))
 
 
-def batch_translate_text(project_id, location,
+def batch_translate_text(translate_client, project_id, location,
                          input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
                          output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/"):
     """
     Translates a batch of texts on GCS and stores the result in a GCS location.
     Args:
+        translate_client:
         project_id:
         location:
         input_uri: