5 năm trước · a09a3c892e
--- a/scripts/extraction.py
+++ b/scripts/extraction.py
@@ -31,14 +31,14 @@ for blob in lst_pdf_blobs:
 
				 
			
 
				     # Generate all paths
			
 
				     gcs_source_path = 'gs://' + bucket_name + '/' + blob.name
			
 
				-    json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title
			
 
				+    json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title + '-'
			
 
				 
			
 
				     # OCR pdf documents
			
 
				     async_detect_document(vision_client,
			
 
				                           gcs_source_path,
			
 
				                           json_gcs_dest_path)
			
 
				 total_time = time.time() - start_time
			
 
				-logging.info("Vision API successfully completed OCR of all {} documents on {} minutes".format(round(total_time / 60,1)))
			
 
				+logging.info("Vision API successfully completed OCR of all documents on {} minutes".format(round(total_time / 60,1)))
			
 
				 
			
 
				 # Extracting the text now
			
 
				 start_time = time.time()
			
@@ -50,11 +50,12 @@ for blob in lst_json_blobs:
 
				     txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'
			
 
				 
			
 
				     # Parse json
			
 
				-    all_text = read_json_result(bucket_name=bucket_name, doc_title=doc_title)
			
 
				+    all_text = read_json_result(storage_client=storage_client, bucket_name=bucket_name, doc_title=doc_title)
			
 
				 
			
 
				     # Upload raw text to GCS
			
 
				-    upload_blob(all_text, txt_gcs_dest_path)
			
 
				+    upload_blob(storage_client=storage_client, bucket_name=bucket_name,
			
 
				+                txt_content=all_text, destination_blob_name=txt_gcs_dest_path)
			
 
				 
			
 
				 total_time = time.time() - start_time
			
 
				 logging.info(
			
 
				-    'Successful parsing of all {} documents resulting from Vision API on {} minutes'.format(round(total_time / 60, 1)))
			
 
				+    'Successful parsing of all documents resulting from Vision API on {} minutes'.format(round(total_time / 60, 1)))
			
--- a/scripts/utils/preprocessing_fcn.py
+++ b/scripts/utils/preprocessing_fcn.py
@@ -65,7 +65,7 @@ def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri, ba
 
				     logging.info('Text extraction from document {} is completed.'.format(doc_title))
			
 
				 
			
 
				 
			
 
				-def read_json_result(bucket_name, doc_title):
			
 
				+def read_json_result(bucket_name, doc_title, storage_client):
			
 
				     """
			
 
				     Parsing the json files and extract text.
			
 
				     Args:
			
@@ -75,11 +75,11 @@ def read_json_result(bucket_name, doc_title):
 
				     Returns:
			
 
				         all_text: str - Containing all text of the document
			
 
				     """
			
 
				-    gcs_destination_prefix = 'json/' + '{}-'.format(doc_title)
			
 
				+    gcs_src_prefix = 'json/' + '{}-'.format(doc_title)
			
 
				 
			
 
				     # List objects with the given prefix.
			
 
				     blob_list = list(storage_client.list_blobs(bucket_or_name=bucket_name,
			
 
				-                                               prefix=gcs_destination_prefix))
			
 
				+                                               prefix=gcs_src_prefix))
			
 
				     all_text = ''
			
 
				     for blob in blob_list:
			
 
				 
			
@@ -98,7 +98,7 @@ def read_json_result(bucket_name, doc_title):
 
				     return all_text
			
 
				 
			
 
				 
			
 
				-def upload_blob(bucket_name, txt_content, destination_blob_name):
			
 
				+def upload_blob(storage_client, bucket_name, txt_content, destination_blob_name):
			
 
				     """
			
 
				     Uploads a file to the bucket.
			
 
				     Args: