CF_translate.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. import base64
  2. import json
  3. import os
  4. import re
  5. import time
  6. import logging
  7. from google.cloud import pubsub_v1, translate, storage
  def doTranslation(translate_client, project_id, text, src_lang="it", target_lang="en-US"):
      """
      Translate a piece of plain text with the Cloud Translation API.

      Args:
          translate_client: client object exposing `translate_text` (and, on
              older library versions, `location_path`)
          project_id: str - project used to build the request's parent path
          text: str - text to translate
          src_lang: str - source language code, default "it"
          target_lang: str - target language code, default "en-US"
      Returns:
          translated_txt: str - translated text of the last translation in
              the API response
      Raises:
          ValueError: if the API response contains no translations
      """
      logging.info('Translating text into {}.'.format(target_lang))

      # Older clients provide a `location_path` helper; when it is absent,
      # build the equivalent resource name by hand.
      # NOTE(review): newer google-cloud-translate releases are understood to
      # have dropped `location_path` - confirm against the pinned version.
      build_parent = getattr(translate_client, 'location_path', None)
      if build_parent is not None:
          parent = build_parent(project_id, location="global")
      else:
          parent = 'projects/{}/locations/global'.format(project_id)

      # Detail on supported types can be found here:
      # https://cloud.google.com/translate/docs/supported-formats
      response = translate_client.translate_text(
          parent=parent,
          contents=[text],
          mime_type="text/plain",
          source_language_code=src_lang,
          target_language_code=target_lang,
      )

      translated_chunks = [item.translated_text for item in response.translations]
      if not translated_chunks:
          # Fail with an explicit message instead of an UnboundLocalError.
          raise ValueError('Translate API returned no translations.')

      # A single content item is sent, so a single translation is expected;
      # keep the historical behaviour of returning the last one.
      return translated_chunks[-1]
  def publishMsg(publisher_client, project_id, text, doc_title, topic_name):
      """
      Send a document's text and title to a Pub/Sub topic as a JSON message.

      Args:
          publisher_client: client exposing `topic_path` and `publish`
          project_id: str - project passed to `topic_path`
          text: str - Text contained in the document
          doc_title: str - title stored under the 'doc_title' key
          topic_name: str - name of the destination topic
      Returns:
          None; the published message id is only logged
      """
      # Pub/Sub payloads must be bytestrings, so serialise to JSON and encode.
      # https://googleapis.dev/python/pubsub/latest/publisher/api/client.html
      payload = json.dumps({'text': text, 'doc_title': doc_title}).encode('utf-8')

      destination = publisher_client.topic_path(project_id, topic_name)

      # publish() hands back a future; result() is called on it to obtain
      # the message id.
      # https://googleapis.dev/python/pubsub/latest/publisher/api/futures.html#google.cloud.pubsub_v1.publisher.futures.Future
      published_id = publisher_client.publish(destination, data=payload).result()

      logging.info("Message id: {} was published in topic: {}".format(published_id, topic_name))
  def uploadBlob(storage_client, bucket_name, txt_content, destination_blob_name):
      """
      Write text content to an object in a storage bucket.

      Args:
          storage_client: client exposing `bucket(name)`
          bucket_name: str - name of the destination bucket
          txt_content: str - text to upload
          destination_blob_name: str - object name; anything up to and including
              the last 'gs://<bucket_name>/' occurrence is dropped
      Returns:
          None
      """
      # Keep only the part after the bucket URI so a full gs:// path and a
      # bare object name end up at the same destination.
      bucket_uri = 'gs://{}/'.format(bucket_name)
      destination_blob_name = destination_blob_name.rpartition(bucket_uri)[2]

      target = storage_client.bucket(bucket_name).blob(destination_blob_name)
      target.upload_from_string(txt_content)

      logging.info("Text uploaded to {}".format(destination_blob_name))
  def cleanEngText(eng_raw_string, customize_stop_words=None):
      """
      Strip dates, figure/image captions, punctuation and stop words from text.

      Args:
          eng_raw_string: str - raw English text
          customize_stop_words: iterable of str - stop words to remove
              (case-sensitive, whole-token match); None means no stop words
      Returns:
          refined_doc: str - the remaining tokens, each preceded by a single
              space (a non-empty result therefore starts with a space);
              '' when no token is left
      """
      # A set gives O(1) membership tests; None replaces the former mutable
      # default argument.
      stop_words = frozenset(customize_stop_words or ())

      # Remove dates: 1 or 2 digits, forward slash, 1 or 2 digits, forward
      # slash, 4 digits (e.g. 12/03/2020).
      eng_raw_string = re.sub(r'(\d{1,2})/(\d{1,2})/(\d{4})', '', eng_raw_string)
      # Remove figure references such as "Figure 12".
      eng_raw_string = re.sub(r'Figure (\d{1,2})', '', eng_raw_string)
      # Remove lines consisting only of "Image <one char>". MULTILINE makes
      # ^ and $ match at every line boundary, not just at the ends of the
      # whole text.
      eng_raw_string = re.sub(r'^Image .$', '', eng_raw_string, flags=re.MULTILINE)

      # Replace punctuation and special characters with a single space.
      eng_raw_string = re.sub(r'[^A-Za-z0-9]+', ' ', eng_raw_string)

      # Drop custom stop words and rebuild the text in one pass; the leading
      # space before every token is kept for backward compatibility.
      kept_tokens = (token for token in eng_raw_string.split() if token not in stop_words)
      return ''.join(' {}'.format(token) for token in kept_tokens)
  def translateAndRefine(event, context):
      """
      This Cloud Function will be triggered when a message is published on the
      PubSub topic of interest. It calls the Translate API on the message text,
      cleans the translation and uploads both the raw and the curated English
      text to Cloud Storage.

      The message data is base64-encoded JSON from which the keys 'text' and
      'doc_title' are read.
      args:
          event (dict): Metadata of the event, received from Pub/Sub.
          context (google.cloud.functions.Context): Metadata of triggering event.
      returns:
          None; the output is written to stdout and Stackdriver Logging
      raises:
          ValueError: if the message has no data, no 'text' or no 'doc_title'.
          KeyError: if the GCP_PROJECT environment variable is not set.
      """
      start_time = time.time()

      # Decode and validate the message first so that a malformed message
      # fails fast, before any API client is created.
      if not event.get('data'):
          raise ValueError('Data sector is missing in the Pub/Sub message.')
      message = json.loads(base64.b64decode(event['data']).decode('utf-8'))

      it_text = message.get('text')
      doc_title = message.get('doc_title')
      if it_text is None:
          raise ValueError("Field 'text' is missing in the Pub/Sub message.")
      if not doc_title:
          # Without a title the objects would be stored as 'None.txt' / '.txt'.
          raise ValueError("Field 'doc_title' is missing in the Pub/Sub message.")

      # SET VARIABLES
      project_id = os.environ['GCP_PROJECT']
      dest_bucket = 'covid19-repo-test'

      # INSTANTIATION
      translate_client = translate.TranslationServiceClient()
      storage_client = storage.Client()

      # Step 1: Call Translate API
      raw_eng_text = doTranslation(translate_client, project_id, it_text)

      # Step 2: Clean eng text
      curated_eng_text = cleanEngText(raw_eng_text)

      # Step 3: Publishing the curated text to Pub/Sub is currently disabled.
      # To re-enable it, create a pubsub_v1.PublisherClient() and call
      # publishMsg(publisher_client, project_id, curated_eng_text, doc_title, topic_name).

      # Step 4: Upload translated text (raw and curated) under separate prefixes
      prefix_raw_eng_txt = 'eng_txt/{}.txt'.format(doc_title)
      uploadBlob(storage_client, dest_bucket, raw_eng_text, prefix_raw_eng_txt)
      prefix_curated_eng_txt = 'curated_eng_txt/{}.txt'.format(doc_title)
      uploadBlob(storage_client, dest_bucket, curated_eng_text, prefix_curated_eng_txt)

      elapsed = time.time() - start_time
      logging.info("Completion of text_extract took: {} seconds".format(round(elapsed, 1)))