| 
					
				 | 
			
			
				@@ -1,6 +1,6 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from google.cloud import storage, translate 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from google.oauth2 import service_account 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from utils.preprocessing_fcn import batch_translate_text, upload_blob 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from utils.preprocessing_fcn import batch_translate_text, uploadBlob 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import logging 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 logging.getLogger().setLevel(logging.INFO) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -21,6 +21,8 @@ translate_client = translate.TranslationServiceClient(credentials=credentials) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                                            prefix='raw_txt') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+lst_raw_txt_blobs_2 = storage_client.list_blobs(bucket_or_name=bucket_name, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                                           prefix='raw_txt') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 customize_stop_words = [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital', 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -35,6 +37,9 @@ customize_stop_words = [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 start_time = time.time() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Parallelize translation: start every batch operation first, then wait for all of them 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+operation_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 for blob in lst_raw_txt_blobs: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     doc_title = blob.name.split('/')[-1].split('.')[0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -44,19 +49,27 @@ for blob in lst_raw_txt_blobs: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # Translate raw text to English 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        batch_translate_text(translate_client=translate_client, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        operation = batch_translate_text(translate_client=translate_client, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                              project_id=project_id, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                              input_uri=txt_gcs_dest_path, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                              output_uri=eng_txt_gcs_dest_path) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        logging.info("Translation of {} document was successful.".format(doc_title)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        operation_list.append(operation) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        logging.info("Translation of {} document was started.".format(doc_title)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         logging.error("Error", e) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+for operation in operation_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    operation.result(timeout=180) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+total_time = time.time() - start_time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+logging.info("Translation is done in {} minutes".format( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    round(total_time / 60, 1))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+for blob in lst_raw_txt_blobs_2: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    doc_title = blob.name.split('/')[-1].split('.')[0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # Curate eng raw text 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                                                                         bucket_name, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                                                                         doc_title) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     eng_raw_string = eng_blob.download_as_string().decode('utf-8') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -82,10 +95,12 @@ for blob in lst_raw_txt_blobs: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         refined_doc += ' {}'.format(word) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     # Upload raw text to GCS 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    upload_blob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    uploadBlob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 destination_blob_name=processed_eng_gcs_dest_path) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     logging.info("The curation of {} text completed successfully.".format(doc_title)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 total_time = time.time() - start_time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     round(total_time / 60, 1))) 
			 |