|
@@ -1,6 +1,6 @@
|
|
from google.cloud import storage, translate
|
|
from google.cloud import storage, translate
|
|
from google.oauth2 import service_account
|
|
from google.oauth2 import service_account
|
|
-from utils.preprocessing_fcn import batch_translate_text, upload_blob
|
|
|
|
|
|
+from utils.preprocessing_fcn import batch_translate_text, uploadBlob
|
|
import logging
|
|
import logging
|
|
logging.getLogger().setLevel(logging.INFO)
|
|
logging.getLogger().setLevel(logging.INFO)
|
|
|
|
|
|
@@ -21,6 +21,8 @@ translate_client = translate.TranslationServiceClient(credentials=credentials)
|
|
|
|
|
|
lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
|
|
lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
|
|
prefix='raw_txt')
|
|
prefix='raw_txt')
|
|
|
|
+lst_raw_txt_blobs_2 = storage_client.list_blobs(bucket_or_name=bucket_name,
|
|
|
|
+ prefix='raw_txt')
|
|
|
|
|
|
customize_stop_words = [
|
|
customize_stop_words = [
|
|
'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
|
|
'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
|
|
@@ -35,6 +37,9 @@ customize_stop_words = [
|
|
]
|
|
]
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
|
|
+# Parallelize translation: start all batch operations first, then wait for their results
|
|
|
|
+operation_list = []
|
|
|
|
+
|
|
for blob in lst_raw_txt_blobs:
|
|
for blob in lst_raw_txt_blobs:
|
|
doc_title = blob.name.split('/')[-1].split('.')[0]
|
|
doc_title = blob.name.split('/')[-1].split('.')[0]
|
|
|
|
|
|
@@ -44,19 +49,27 @@ for blob in lst_raw_txt_blobs:
|
|
|
|
|
|
# Translate raw text to English
|
|
# Translate raw text to English
|
|
try:
|
|
try:
|
|
- batch_translate_text(translate_client=translate_client,
|
|
|
|
|
|
+ operation = batch_translate_text(translate_client=translate_client,
|
|
project_id=project_id,
|
|
project_id=project_id,
|
|
input_uri=txt_gcs_dest_path,
|
|
input_uri=txt_gcs_dest_path,
|
|
output_uri=eng_txt_gcs_dest_path)
|
|
output_uri=eng_txt_gcs_dest_path)
|
|
- logging.info("Translation of {} document was successful.".format(doc_title))
|
|
|
|
|
|
+ operation_list.append(operation)
|
|
|
|
+ logging.info("Translation of {} document was started.".format(doc_title))
|
|
except Exception as e:
|
|
except Exception as e:
|
|
logging.error("Error", e)
|
|
logging.error("Error", e)
|
|
|
|
+for operation in operation_list:
|
|
|
|
+ operation.result(timeout=180)
|
|
|
|
+total_time = time.time() - start_time
|
|
|
|
+logging.info("Translation is done in {} minutes".format(
|
|
|
|
+ round(total_time / 60, 1)))
|
|
|
|
|
|
|
|
+for blob in lst_raw_txt_blobs_2:
|
|
|
|
+ doc_title = blob.name.split('/')[-1].split('.')[0]
|
|
|
|
+ processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'
|
|
# Curate eng raw text
|
|
# Curate eng raw text
|
|
blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
|
|
blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
|
|
bucket_name,
|
|
bucket_name,
|
|
doc_title)
|
|
doc_title)
|
|
-
|
|
|
|
eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix)
|
|
eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix)
|
|
eng_raw_string = eng_blob.download_as_string().decode('utf-8')
|
|
eng_raw_string = eng_blob.download_as_string().decode('utf-8')
|
|
|
|
|
|
@@ -82,10 +95,12 @@ for blob in lst_raw_txt_blobs:
|
|
refined_doc += ' {}'.format(word)
|
|
refined_doc += ' {}'.format(word)
|
|
|
|
|
|
# Upload curated text to GCS
|
|
# Upload curated text to GCS
|
|
- upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
|
|
|
|
|
|
+ uploadBlob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
|
|
destination_blob_name=processed_eng_gcs_dest_path)
|
|
destination_blob_name=processed_eng_gcs_dest_path)
|
|
logging.info("The curation of {} text completed successfully.".format(doc_title))
|
|
logging.info("The curation of {} text completed successfully.".format(doc_title))
|
|
|
|
|
|
|
|
+
|
|
|
|
+
|
|
total_time = time.time() - start_time
|
|
total_time = time.time() - start_time
|
|
logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format(
|
|
logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format(
|
|
round(total_time / 60, 1)))
|
|
round(total_time / 60, 1)))
|