Bladeren bron

client instantiation fixes

Aziz Ketari 4 jaar geleden
bovenliggende
commit
a09a3c892e
2 gewijzigde bestanden met 10 toevoegingen en 9 verwijderingen
  1. 6 5
      scripts/extraction.py
  2. 4 4
      scripts/utils/preprocessing_fcn.py

+ 6 - 5
scripts/extraction.py

@@ -31,14 +31,14 @@ for blob in lst_pdf_blobs:
 
     # Generate all paths
     gcs_source_path = 'gs://' + bucket_name + '/' + blob.name
-    json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title
+    json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title + '-'
 
     # OCR pdf documents
     async_detect_document(vision_client,
                           gcs_source_path,
                           json_gcs_dest_path)
 total_time = time.time() - start_time
-logging.info("Vision API successfully completed OCR of all {} documents on {} minutes".format(round(total_time / 60,1)))
+logging.info("Vision API successfully completed OCR of all documents on {} minutes".format(round(total_time / 60,1)))
 
 # Extracting the text now
 start_time = time.time()
@@ -50,11 +50,12 @@ for blob in lst_json_blobs:
     txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'
 
     # Parse json
-    all_text = read_json_result(bucket_name=bucket_name, doc_title=doc_title)
+    all_text = read_json_result(storage_client=storage_client, bucket_name=bucket_name, doc_title=doc_title)
 
     # Upload raw text to GCS
-    upload_blob(all_text, txt_gcs_dest_path)
+    upload_blob(storage_client=storage_client, bucket_name=bucket_name,
+                txt_content=all_text, destination_blob_name=txt_gcs_dest_path)
 
 total_time = time.time() - start_time
 logging.info(
-    'Successful parsing of all {} documents resulting from Vision API on {} minutes'.format(round(total_time / 60, 1)))
+    'Successful parsing of all documents resulting from Vision API on {} minutes'.format(round(total_time / 60, 1)))

+ 4 - 4
scripts/utils/preprocessing_fcn.py

@@ -65,7 +65,7 @@ def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri, ba
     logging.info('Text extraction from document {} is completed.'.format(doc_title))
 
 
-def read_json_result(bucket_name, doc_title):
+def read_json_result(bucket_name, doc_title, storage_client):
     """
     Parsing the json files and extract text.
     Args:
@@ -75,11 +75,11 @@ def read_json_result(bucket_name, doc_title):
     Returns:
         all_text: str - Containing all text of the document
     """
-    gcs_destination_prefix = 'json/' + '{}-'.format(doc_title)
+    gcs_src_prefix = 'json/' + '{}-'.format(doc_title)
 
     # List objects with the given prefix.
     blob_list = list(storage_client.list_blobs(bucket_or_name=bucket_name,
-                                               prefix=gcs_destination_prefix))
+                                               prefix=gcs_src_prefix))
     all_text = ''
     for blob in blob_list:
 
@@ -98,7 +98,7 @@ def read_json_result(bucket_name, doc_title):
     return all_text
 
 
-def upload_blob(bucket_name, txt_content, destination_blob_name):
+def upload_blob(storage_client, bucket_name, txt_content, destination_blob_name):
     """
     Uploads a file to the bucket.
     Args: