extraction.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. from google.cloud import storage, vision
  2. from google.oauth2 import service_account
  3. from utils.preprocessing_fcn import readJsonResult, uploadBlob, asyncDetectDocument
  4. import logging
  5. logging.getLogger().setLevel(logging.INFO)
  6. import time
  7. import os
  8. project_id = os.getenv('PROJECT_ID')
  9. bucket_name = os.getenv('BUCKET_NAME')
  10. location = os.getenv('LOCATION')
  11. key_path = os.getenv('SA_KEY_PATH')
  12. credentials = service_account.Credentials.from_service_account_file(key_path)
  13. storage_client = storage.Client(credentials=credentials)
  14. vision_client = vision.ImageAnnotatorClient(credentials=credentials)
  15. lst_pdf_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
  16. prefix='pdf')
  17. lst_json_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
  18. prefix='json')
  19. start_time = time.time()
  20. operation_list = []
  21. for blob in lst_pdf_blobs:
  22. doc_title = blob.name.split('/')[-1].split('.pdf')[0]
  23. # Generate all paths
  24. gcs_source_path = 'gs://' + bucket_name + '/' + blob.name
  25. json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title + '-'
  26. # OCR pdf documents
  27. operation = asyncDetectDocument(vision_client,
  28. gcs_source_path,
  29. json_gcs_dest_path)
  30. operation_list.append(operation)
  31. for operation in operation_list:
  32. operation.result()
  33. total_time = time.time() - start_time
  34. logging.info("Vision API successfully completed OCR of all documents on {} minutes".format(round(total_time / 60, 1)))
  35. # Extracting the text now
  36. start_time = time.time()
  37. for blob in lst_json_blobs:
  38. doc_title = blob.name.split('/')[-1].split('-')[0]
  39. # Define GCS paths
  40. # json_gcs_dest_path = 'gs://' + bucket_name + '/{}'.format(blob.name)
  41. txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'
  42. # Parse json
  43. all_text = readJsonResult(storage_client=storage_client, bucket_name=bucket_name, doc_title=doc_title)
  44. # Upload raw text to GCS
  45. uploadBlob(storage_client=storage_client, bucket_name=bucket_name,
  46. txt_content=all_text, destination_blob_name=txt_gcs_dest_path)
  47. total_time = time.time() - start_time
  48. logging.info(
  49. 'Successful parsing of all documents resulting from Vision API on {} minutes'.format(round(total_time / 60, 1)))