| 
														
															@@ -1,6 +1,6 @@ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from google.cloud import storage, translate 
														 | 
														
														 | 
														
															 from google.cloud import storage, translate 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 from google.oauth2 import service_account 
														 | 
														
														 | 
														
															 from google.oauth2 import service_account 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-from utils.preprocessing_fcn import batch_translate_text, upload_blob 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+from utils.preprocessing_fcn import batch_translate_text, uploadBlob 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 import logging 
														 | 
														
														 | 
														
															 import logging 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 logging.getLogger().setLevel(logging.INFO) 
														 | 
														
														 | 
														
															 logging.getLogger().setLevel(logging.INFO) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -21,6 +21,8 @@ translate_client = translate.TranslationServiceClient(credentials=credentials) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
														
														 | 
														
															 lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                                            prefix='raw_txt') 
														 | 
														
														 | 
														
															                                            prefix='raw_txt') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+lst_raw_txt_blobs_2 = storage_client.list_blobs(bucket_or_name=bucket_name, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                                           prefix='raw_txt') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 customize_stop_words = [ 
														 | 
														
														 | 
														
															 customize_stop_words = [ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital', 
														 | 
														
														 | 
														
															     'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital', 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -35,6 +37,9 @@ customize_stop_words = [ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 ] 
														 | 
														
														 | 
														
															 ] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 start_time = time.time() 
														 | 
														
														 | 
														
															 start_time = time.time() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+# parrallize translate 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+operation_list = [] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 for blob in lst_raw_txt_blobs: 
														 | 
														
														 | 
														
															 for blob in lst_raw_txt_blobs: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     doc_title = blob.name.split('/')[-1].split('.')[0] 
														 | 
														
														 | 
														
															     doc_title = blob.name.split('/')[-1].split('.')[0] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -44,19 +49,27 @@ for blob in lst_raw_txt_blobs: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # Translateba raw text to english 
														 | 
														
														 | 
														
															     # Translateba raw text to english 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     try: 
														 | 
														
														 | 
														
															     try: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        batch_translate_text(translate_client=translate_client, 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        operation = batch_translate_text(translate_client=translate_client, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                              project_id=project_id, 
														 | 
														
														 | 
														
															                              project_id=project_id, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                              input_uri=txt_gcs_dest_path, 
														 | 
														
														 | 
														
															                              input_uri=txt_gcs_dest_path, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                              output_uri=eng_txt_gcs_dest_path) 
														 | 
														
														 | 
														
															                              output_uri=eng_txt_gcs_dest_path) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        logging.info("Translation of {} document was successful.".format(doc_title)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        operation_list.append(operation) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        logging.info("Translation of {} document was started.".format(doc_title)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     except Exception as e: 
														 | 
														
														 | 
														
															     except Exception as e: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         logging.error("Error", e) 
														 | 
														
														 | 
														
															         logging.error("Error", e) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+for operation in operation_list: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    operation.result(timeout=180) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+total_time = time.time() - start_time 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+logging.info("Translation is done in {} minutes".format( 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    round(total_time / 60, 1))) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+for blob in lst_raw_txt_blobs_2: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    doc_title = blob.name.split('/')[-1].split('.')[0] 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt' 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # Curate eng raw text 
														 | 
														
														 | 
														
															     # Curate eng raw text 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, 
														 | 
														
														 | 
														
															     blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                                                                         bucket_name, 
														 | 
														
														 | 
														
															                                                                         bucket_name, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                                                                         doc_title) 
														 | 
														
														 | 
														
															                                                                         doc_title) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix) 
														 | 
														
														 | 
														
															     eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     eng_raw_string = eng_blob.download_as_string().decode('utf-8') 
														 | 
														
														 | 
														
															     eng_raw_string = eng_blob.download_as_string().decode('utf-8') 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -82,10 +95,12 @@ for blob in lst_raw_txt_blobs: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         refined_doc += ' {}'.format(word) 
														 | 
														
														 | 
														
															         refined_doc += ' {}'.format(word) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # Upload raw text to GCS 
														 | 
														
														 | 
														
															     # Upload raw text to GCS 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc, 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    uploadBlob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc, 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                 destination_blob_name=processed_eng_gcs_dest_path) 
														 | 
														
														 | 
														
															                 destination_blob_name=processed_eng_gcs_dest_path) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     logging.info("The curation of {} text completed successfully.".format(doc_title)) 
														 | 
														
														 | 
														
															     logging.info("The curation of {} text completed successfully.".format(doc_title)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 total_time = time.time() - start_time 
														 | 
														
														 | 
														
															 total_time = time.time() - start_time 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format( 
														 | 
														
														 | 
														
															 logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format( 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     round(total_time / 60, 1))) 
														 | 
														
														 | 
														
															     round(total_time / 60, 1))) 
														 |