preprocessing.py

from google.cloud import storage, translate
from google.oauth2 import service_account
from utils.preprocessing_fcn import batch_translate_text, uploadBlob
import logging
import os
import re
import time

logging.getLogger().setLevel(logging.INFO)

# Configuration is read from environment variables.
project_id = os.getenv('PROJECT_ID')
bucket_name = os.getenv('BUCKET_NAME')
location = os.getenv('LOCATION')
key_path = os.getenv('SA_KEY_PATH')

credentials = service_account.Credentials.from_service_account_file(key_path)
storage_client = storage.Client(credentials=credentials)
translate_client = translate.TranslationServiceClient(credentials=credentials)
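# The helpers imported from utils.preprocessing_fcn are not shown in this file.
# Based on how they are called below, batch_translate_text is assumed to wrap
# TranslationServiceClient.batch_translate_text and return the long-running
# operation, and uploadBlob is assumed to write a text string to the given
# destination in the bucket. Treat these as assumptions about that module,
# not its definitive interface.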
# list_blobs returns a one-shot iterator, so two separate listings are created:
# one for the translation pass and one for the curation pass.
lst_raw_txt_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                              prefix='raw_txt')
lst_raw_txt_blobs_2 = storage_client.list_blobs(bucket_or_name=bucket_name,
                                                prefix='raw_txt')
customize_stop_words = [
    'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
    'Borgheresi', 'Agostini', 'Ottaviani', 'Floridi', 'Giovagnoni', 'di', 'specialization',
    'Polytechnic', 'University', 'marche', 'ANCONA', 'Italy', 'Azienda', 'Ospedali',
    'Riuniti', 'Yorrette', 'Matera', 'Michele', 'Nardella', 'Gerardo', 'Costanzo',
    'Claudia', 'Lopez', 'st', 'a.', 'a', 'of', 's', 'cien', 'ze', 'diolog', 'ic', 'he',
    'â', '€', 's', 'b', 'case', 'Cuoladi', 'l', 'c', 'ra', 'bergamo', 'patelli', 'est', 'asst',
    'dr', 'Dianluigi', 'Svizzero', 'i', 'riccardo', 'Alessandro', 'Spinazzola', 'angelo',
    'maggiore', 'p', 'r', 't', 'm', 'en', 't', 'o', 'd', 'e', 'n', 'd', 'o', 'g', 'h', 'u',
    'man', 'female', 'D'
]
start_time = time.time()

# Parallelize translation: start every batch translation operation first, then wait.
operation_list = []
for blob in lst_raw_txt_blobs:
    doc_title = blob.name.split('/')[-1].split('.')[0]
    txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'
    eng_txt_gcs_dest_path = 'gs://' + bucket_name + '/eng_txt/{}/'.format(doc_title)
    processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'

    # Translate raw text to English
    try:
        operation = batch_translate_text(translate_client=translate_client,
                                         project_id=project_id,
                                         input_uri=txt_gcs_dest_path,
                                         output_uri=eng_txt_gcs_dest_path)
        operation_list.append(operation)
        logging.info("Translation of {} document was started.".format(doc_title))
    except Exception as e:
        logging.error("Translation of %s failed to start: %s", doc_title, e)

# Block until every long-running translation operation has finished.
for operation in operation_list:
    operation.result(timeout=180)

total_time = time.time() - start_time
logging.info("Translation is done in {} minutes".format(
    round(total_time / 60, 1)))
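# Expected bucket layout implied by the paths used in this script (an assumption
# drawn from the code above, not a documented contract): source text under
# raw_txt/, Translation API batch output under eng_txt/<doc_title>/, and curated
# output under curated_eng_txt/.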
for blob in lst_raw_txt_blobs_2:
    doc_title = blob.name.split('/')[-1].split('.')[0]
    processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'

    # Curate the translated English text. The batch translation job names its
    # output file after the input bucket, input path and target language.
    blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
                                                                        bucket_name,
                                                                        doc_title)
    eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix)
    eng_raw_string = eng_blob.download_as_string().decode('utf-8')

    # Remove dates: 1- or 2-digit day, forward slash, 1- or 2-digit month,
    # forward slash, 4-digit year.
    pattern_dates = r'(\d{1,2})/(\d{1,2})/(\d{4})'
    pattern_fig = r'Figure (\d{1,2})'
    pattern_image = r'^Image .$'
    replace = ''
    eng_raw_string = re.sub(pattern_dates, replace, eng_raw_string)
    eng_raw_string = re.sub(pattern_fig, replace, eng_raw_string)
    eng_raw_string = re.sub(pattern_image, replace, eng_raw_string)

    # Remove punctuation and special characters.
    eng_raw_string = re.sub('[^A-Za-z0-9]+', ' ', eng_raw_string)

    # Remove custom stop words.
    tokens = [token for token in eng_raw_string.split() if token not in customize_stop_words]
    refined_doc = ' '.join(tokens)

    # Upload the curated text to GCS (uploadBlob is assumed to accept the full
    # destination path built above).
    uploadBlob(storage_client=storage_client, bucket_name=bucket_name, txt_content=refined_doc,
               destination_blob_name=processed_eng_gcs_dest_path)
    logging.info("The curation of {} text completed successfully.".format(doc_title))
total_time = time.time() - start_time
logging.info('The translation and curation of all documents was successfully completed in {} minutes.'.format(
    round(total_time / 60, 1)))
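# Usage (a sketch, assuming a service-account key with Storage and Translation
# permissions and a bucket that already contains the raw_txt/ files):
#   export PROJECT_ID=<gcp-project> BUCKET_NAME=<bucket> LOCATION=<region> \
#          SA_KEY_PATH=/path/to/key.json
#   python preprocessing.py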