# extraction.py (2.3 KB)
import logging
import os
import time

from google.cloud import storage, vision
from google.oauth2 import service_account

from utils.preprocessing_fcn import async_detect_document, read_json_result, upload_blob
  7. project_id = os.getenv('PROJECT_ID')
  8. bucket_name = os.getenv('BUCKET_NAME')
  9. location = os.getenv('LOCATION')
  10. key_path = os.getenv('SA_KEY_PATH')
  11. credentials = service_account.Credentials.from_service_account_file(key_path)
  12. storage_client = storage.Client(credentials=credentials,
  13. project_id=project_id)
  14. vision_client = vision.Client(credentials=credentials,
  15. project_id=project_id)
  16. lst_pdf_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
  17. prefix='pdf')
  18. lst_json_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
  19. prefix='json')
  20. start_time = time.time()
  21. nbr_documents = len(lst_pdf_blobs)
  22. for blob in lst_pdf_blobs:
  23. doc_title = blob.name.split('/')[-1].split('.pdf')[0]
  24. # Generate all paths
  25. gcs_source_path = 'gs://' + bucket_name + '/' + blob.name
  26. json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + blob.name
  27. # OCR pdf documents
  28. async_detect_document(vision_client,
  29. gcs_source_path,
  30. json_gcs_dest_path)
  31. total_time = time.time() - start_time
  32. logging.info("Vision API successfully completed OCR of all {} documents on {} minutes".format(nbr_documents,
  33. round(total_time / 60,
  34. 1)))
  35. start_time = time.time()
  36. for blob in lst_json_blobs:
  37. doc_title = blob.name.split('/')[-1].split('-')[0]
  38. # Define GCS paths
  39. json_gcs_dest_path = 'gs://' + bucket_name + '/{}'.format(blob.name)
  40. txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'
  41. # Parse json
  42. all_text = read_json_result(json_gcs_dest_path, doc_title)
  43. # Upload raw text to GCS
  44. upload_blob(all_text, txt_gcs_dest_path)
  45. total_time = time.time() - start_time
  46. logging.info(
  47. 'Successful parsing of all {} documents resulting from Vision API on {} minutes'.format(round(total_time / 60, 1)))