preprocessing_fcn.py

"""Preprocessing helpers: OCR PDFs stored on GCS with the Vision API, parse
the JSON results, and batch-translate the extracted text with the
Translation API."""
import logging
import os

from google.cloud import storage, translate, vision
# from google.oauth2 import service_account
from google.protobuf import json_format

# DEVELOPER: set these environment variables (including the path to your
# service-account key) before uncommenting the explicit-credentials setup below.
# project_id = os.getenv('PROJECT_ID')
# bucket_name = os.getenv('BUCKET_NAME')
# location = os.getenv('LOCATION')
# key_path = os.getenv('SA_KEY_PATH')
#
# credentials = service_account.Credentials.from_service_account_file(key_path)
#
# storage_client = storage.Client(project=credentials.project_id,
#                                 credentials=credentials)
#
# translate_client = translate.TranslationServiceClient(credentials=credentials)
#
# vision_client = vision.ImageAnnotatorClient(credentials=credentials)
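
# A minimal alternative sketch using Application Default Credentials instead
# of an explicit key file; it assumes ADC is configured (GOOGLE_APPLICATION_CREDENTIALS
# or `gcloud auth application-default login`). The helper name make_clients is
# ours, for illustration, not part of the original module.
def make_clients():
    """Instantiate the three GCP clients the functions below expect."""
    storage_client = storage.Client()
    vision_client = vision.ImageAnnotatorClient()
    translate_client = translate.TranslationServiceClient()
    return storage_client, vision_client, translate_client
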

def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri,
                          batch_size=20):
    """
    OCR a PDF/TIFF stored on GCS and write the JSON results back to GCS.

    Args:
        vision_client: An initialized vision.ImageAnnotatorClient.
        gcs_source_uri: gs:// URI of the source PDF/TIFF.
        gcs_destination_uri: gs:// URI prefix for the JSON output files.
        batch_size: How many pages should be grouped into each JSON output file.

    Returns:
        None
    """
    doc_title = gcs_source_uri.split('/')[-1].split('.pdf')[0]

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # Vision API feature for dense-text (document) OCR
    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = vision_client.async_batch_annotate_files(
        requests=[async_request])

    # Block until the long-running operation finishes (or times out).
    operation.result(timeout=180)
    logging.info('Text extraction from document {} is completed.'.format(doc_title))


def read_json_result(bucket_name, doc_title, storage_client):
    """
    Parse the JSON output files produced by the Vision API and extract the text.

    Args:
        bucket_name: Name of the GCS bucket holding the JSON files.
        doc_title: Document title used as the object-name prefix.
        storage_client: An initialized storage.Client.

    Returns:
        all_text: str - Containing all text of the document.
    """
    gcs_src_prefix = 'json/' + '{}-'.format(doc_title)

    # List objects with the given prefix.
    blob_list = list(storage_client.list_blobs(bucket_or_name=bucket_name,
                                               prefix=gcs_src_prefix))
    all_text = ''
    for blob in blob_list:
        json_string = blob.download_as_string()
        file_response = json_format.Parse(
            json_string, vision.types.AnnotateFileResponse())

        # Each AnnotateFileResponse holds one response per page in the batch.
        for page_response in file_response.responses:
            text_response = page_response.full_text_annotation.text
            all_text += text_response
            all_text += ' '

    logging.info("Parsing of {} json doc was successful.".format(doc_title))
    return all_text

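
# Note on the naming assumed above (illustrative bucket/file names): if
# async_detect_document was called with
# gcs_destination_uri='gs://my-bucket/json/report-', the Vision API writes
# shards such as 'json/report-output-1-to-20.json', which the 'json/report-'
# prefix in read_json_result matches.
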

def upload_blob(storage_client, bucket_name, txt_content, destination_blob_name):
    """
    Uploads a string to the bucket as a text blob.

    Args:
        storage_client: An initialized storage.Client.
        bucket_name: Name of the target GCS bucket.
        txt_content: The text to upload.
        destination_blob_name: Target object name; a full gs:// URI is also
            accepted and reduced to the object path.

    Returns:
        None
    """
    # Accept either 'gs://<bucket>/path/to/file.txt' or a bare 'path/to/file.txt'.
    destination_blob_name = destination_blob_name.split('gs://{}/'.format(bucket_name))[-1]
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_string(txt_content)

    logging.info("Text uploaded to {}".format(destination_blob_name))


def batch_translate_text(translate_client, project_id, location,
                         input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
                         output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/"):
    """
    Translates a batch of texts on GCS and stores the result in a GCS location.

    Args:
        translate_client: An initialized translate.TranslationServiceClient.
        project_id: GCP project id.
        location: Location of the translation endpoint, e.g. 'us-central1'.
        input_uri: gs:// URI of the text file to translate.
        output_uri: gs:// URI prefix under which the results are stored.

    Returns:
        None
    """
    # Supported file types: https://cloud.google.com/translate/docs/supported-formats
    gcs_source = {"input_uri": input_uri}
    input_configs_element = {
        "gcs_source": gcs_source,
        "mime_type": "text/plain"  # Can be "text/plain" or "text/html".
    }
    gcs_destination = {"output_uri_prefix": output_uri}
    output_config = {"gcs_destination": gcs_destination}
    parent = translate_client.location_path(project_id, location)

    # Supported language codes: https://cloud.google.com/translate/docs/languages
    operation = translate_client.batch_translate_text(
        parent=parent,
        source_language_code="it",
        target_language_codes=["en"],  # Up to 10 language codes here.
        input_configs=[input_configs_element],
        output_config=output_config)

    # Block until the batch translation finishes (or times out).
    operation.result(180)
    logging.info('Batch translation of {} is completed.'.format(input_uri))
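

# A minimal end-to-end sketch of how these helpers chain together. Everything
# here is an assumption for illustration: the PROJECT_ID/BUCKET_NAME/LOCATION
# environment variables, Application Default Credentials (via make_clients),
# and the gs:// layout (raw PDFs under 'raw/', JSON shards under 'json/',
# plain text under 'txt/').
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    project_id = os.getenv('PROJECT_ID')
    bucket_name = os.getenv('BUCKET_NAME')
    location = os.getenv('LOCATION', 'us-central1')

    storage_client, vision_client, translate_client = make_clients()

    doc_title = 'sample'

    # 1. OCR the PDF; JSON shards land under json/<doc_title>-*.
    async_detect_document(
        vision_client,
        'gs://{}/raw/{}.pdf'.format(bucket_name, doc_title),
        'gs://{}/json/{}-'.format(bucket_name, doc_title))

    # 2. Stitch the extracted text back together.
    all_text = read_json_result(bucket_name, doc_title, storage_client)

    # 3. Persist the plain text so the Translation API can read it.
    txt_uri = 'gs://{}/txt/{}.txt'.format(bucket_name, doc_title)
    upload_blob(storage_client, bucket_name, all_text, txt_uri)

    # 4. Batch-translate the text file (Italian -> English, per the defaults above).
    batch_translate_text(translate_client, project_id, location,
                         input_uri=txt_uri,
                         output_uri='gs://{}/translated/'.format(bucket_name))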