preprocessing_fcn.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
import logging
import re

from google.cloud import storage, translate, vision
from google.protobuf import json_format
  4. def asyncDetectDocument(vision_client, gcs_source_uri, gcs_destination_uri, batch_size=20):
  5. """
  6. OCR with PDF/TIFF as source files on GCS
  7. Args:
  8. vision_client:
  9. gcs_source_uri:
  10. gcs_destination_uri:
  11. batch_size: How many pages should be grouped into each json output file.
  12. Returns:
  13. """
  14. doc_title = gcs_source_uri.split('/')[-1].split('.pdf')[0]
  15. # Supported mime_types are: 'application/pdf' and 'image/tiff'
  16. mime_type = 'application/pdf'
  17. # Feature in vision API
  18. feature = vision.types.Feature(
  19. type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)
  20. gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
  21. input_config = vision.types.InputConfig(
  22. gcs_source=gcs_source, mime_type=mime_type)
  23. gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
  24. output_config = vision.types.OutputConfig(
  25. gcs_destination=gcs_destination, batch_size=batch_size)
  26. async_request = vision.types.AsyncAnnotateFileRequest(
  27. features=[feature], input_config=input_config,
  28. output_config=output_config)
  29. operation = vision_client.async_batch_annotate_files(
  30. requests=[async_request])
  31. #operation.result(timeout=180)
  32. logging.info('Text extraction from document {} is started.'.format(doc_title))
  33. return operation
  34. def readJsonResult(storage_client, bucket_name, doc_title):
  35. """
  36. Parsing the json files and extract text.
  37. Args:
  38. storage_client:
  39. bucket_name:
  40. doc_title:
  41. Returns:
  42. all_text: str - Containing all text of the document
  43. """
  44. gcs_src_prefix = 'json/' + '{}-'.format(doc_title)
  45. # List objects with the given prefix.
  46. blob_list = list(storage_client.list_blobs(bucket_or_name=bucket_name,
  47. prefix=gcs_src_prefix))
  48. all_text = ''
  49. for blob in blob_list:
  50. json_string = blob.download_as_string()
  51. response = json_format.Parse(
  52. json_string, vision.types.AnnotateFileResponse())
  53. # The actual response for the first page of the input file.
  54. for response in response.responses:
  55. # first_page_response = response.responses[0]
  56. text_response = response.full_text_annotation.text
  57. all_text += text_response
  58. all_text += ' '
  59. logging.info("Parsing of {} json doc was successful.".format(doc_title))
  60. return all_text
  61. def uploadBlob(storage_client, bucket_name, txt_content, destination_blob_name):
  62. """
  63. Uploads a file to the bucket.
  64. Args:
  65. storage_client:
  66. bucket_name:
  67. txt_content:
  68. destination_blob_name:
  69. Returns:
  70. """
  71. #should be fast enough;
  72. destination_blob_name = destination_blob_name.split('gs://{}/'.format(bucket_name))[-1]
  73. bucket = storage_client.bucket(bucket_name)
  74. blob = bucket.blob(destination_blob_name)
  75. blob.upload_from_string(txt_content)
  76. logging.info("Text uploaded to {}".format(destination_blob_name))
  77. def batch_translate_text(translate_client, project_id,
  78. input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
  79. output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/"):
  80. """
  81. Translates a batch of texts on GCS and stores the result in a GCS location.
  82. Args:
  83. translate_client
  84. project_id:
  85. input_uri:
  86. output_uri:
  87. Returns:
  88. """
  89. # Supported file types: https://cloud.google.com/translate/docs/supported-formats
  90. gcs_source = {"input_uri": input_uri}
  91. input_configs_element = {
  92. "gcs_source": gcs_source,
  93. "mime_type": "text/plain" # Can be "text/plain" or "text/html".
  94. }
  95. gcs_destination = {"output_uri_prefix": output_uri}
  96. output_config = {"gcs_destination": gcs_destination}
  97. # Only us-central1 or global are supported location
  98. parent = translate_client.location_path(project_id, location="us-central1")
  99. # Supported language codes: https://cloud.google.com/translate/docs/language
  100. operation = translate_client.batch_translate_text(
  101. parent=parent,
  102. source_language_code="it",
  103. target_language_codes=["en"], # Up to 10 language codes here.
  104. input_configs=[input_configs_element],
  105. output_config=output_config)
  106. return operation