# ner_fcn.py — NER helpers: extract UMLS entities from documents and store them in Datastore.
  1. from google.cloud import datastore
  2. from scispacy.umls_linking import UmlsEntityLinker
  3. import logging
  4. import pandas as pd
  5. import re
  6. def importModel(model_name):
  7. """
  8. Selective import of the required model from scispacy. These models are quite heavy, hence this function.
  9. Args:
  10. model_name: str -
  11. Returns:
  12. """
  13. if model_name == 'en_core_sci_sm':
  14. import en_core_sci_sm
  15. elif model_name == 'en_core_sci_lg':
  16. import en_core_sci_lg
  17. elif model_name == 'en_ner_bc5cdr_md':
  18. import en_ner_bc5cdr_md
  19. def loadModel(model):
  20. """
  21. Loading Named Entity Recognition model.
  22. Args:
  23. model: options: en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md
  24. Returns:
  25. nlp: loaded model
  26. linker: loaded add-on
  27. """
  28. # Load the model
  29. nlp = model.load()
  30. # Add pipe features to pipeline
  31. linker = UmlsEntityLinker(resolve_abbreviations=True)
  32. nlp.add_pipe(linker)
  33. logging.info("Model and add-ons successfully loaded.")
  34. return nlp, linker
  35. def extractMedEntities(vectorized_doc, linker):
  36. """
  37. Returns UMLS entities contained in a text.
  38. Args:
  39. vectorized_doc:
  40. linker:
  41. Returns:
  42. UMLS_tuis_entity: dict - key: entity and value: TUI code
  43. """
  44. # Pattern for TUI code
  45. pattern = 'T(\d{3})'
  46. UMLS_tuis_entity = {}
  47. entity_dict = {}
  48. for idx in range(len(vectorized_doc.ents)):
  49. entity = vectorized_doc.ents[idx]
  50. entity_dict[entity] = ''
  51. for umls_ent in entity._.umls_ents:
  52. entity_dict[entity] = linker.umls.cui_to_entity[umls_ent[0]]
  53. # RegEx expression if contains TUI code
  54. tui = re.search(pattern, str(entity_dict[entity]))
  55. if tui:
  56. UMLS_tuis_entity[str(entity)] = tui.group()
  57. else:
  58. UMLS_tuis_entity[str(entity)] = None
  59. return UMLS_tuis_entity
  60. def addTask(datastore_client, doc_title, entities_dict):
  61. """
  62. Upload entities to Datastore.
  63. Args:
  64. datastore_client:
  65. doc_title:
  66. entities_dict:
  67. Returns:
  68. Datastore key object.
  69. """
  70. key = datastore_client.key('case', doc_title)
  71. task = datastore.Entity(key=key)
  72. task.update(
  73. entities_dict
  74. )
  75. datastore_client.put(task)
  76. # Then get by key for this entity
  77. logging.info("Uploaded {} to Datastore.".format(doc_title))
  78. return datastore_client.get(key)
  79. def getCases(datastore_client, filter_dict, limit=10):
  80. """
  81. Get results of query with custom filters
  82. Args:
  83. datastore_client: Client object
  84. filter_dict: dict - e.g {parameter_A: [entity_name_A, entity_name_B],
  85. parameter_B: [entitiy_name_C]
  86. }
  87. limit: int - result limits per default 10
  88. Returns:
  89. results: list - query results
  90. """
  91. query = datastore_client.query(kind='case')
  92. for key, values in filter_dict.items():
  93. for value in values:
  94. query.add_filter(key, '=', value)
  95. results = list(query.fetch(limit=limit))
  96. return results
  97. def populateDatastore(datastore_client, storage_client, model_name, src_bucket='aketari-covid19-data-update'):
  98. """
  99. Extract UMLS entities and store them in a No-SQL db: Datastore.
  100. Args:
  101. datastore_client: Storage client instantiation -
  102. storage_client: Storage client instantiation -
  103. model_name: str -
  104. src_bucket: str - contains pdf of the newest files
  105. Returns:
  106. Queriable database
  107. """
  108. lst_curated_blobs = storage_client.list_blobs(bucket_or_name=src_bucket)
  109. importModel(model_name)
  110. if model_name == 'en_core_sci_sm':
  111. nlp, linker = loadModel(model=en_core_sci_sm)
  112. elif model_name == 'en_core_sci_lg':
  113. nlp, linker = loadModel(model=en_core_sci_lg)
  114. elif model_name == 'en_ner_bc5cdr_md':
  115. nlp, linker = loadModel(model=en_ner_bc5cdr_md)
  116. else:
  117. return False
  118. for blob in lst_curated_blobs:
  119. doc_title = blob.name.split('/')[-1].split('.pdf')[0]
  120. # download as string
  121. eng_string = blob.download_as_string().decode('utf-8')
  122. # convert to vector
  123. doc = nlp(eng_string)
  124. # Extract medical entities
  125. UMLS_tuis_entity = extractMedEntities(doc, linker)
  126. # Mapping of UMLS entities with reference csv
  127. entities = list(UMLS_tuis_entity.keys())
  128. TUIs = list(UMLS_tuis_entity.values())
  129. df_entities = pd.DataFrame(data={'entity': entities, 'TUIs': TUIs})
  130. df_reference_TUIs = pd.read_csv('./scripts/utils/UMLS_tuis.csv')
  131. df_annotated_text_entities = pd.merge(df_entities, df_reference_TUIs, how='inner', on=['TUIs'])
  132. # Upload entities to datastore
  133. entities_dict = {}
  134. for idx in range(df_annotated_text_entities.shape[0]):
  135. category = df_annotated_text_entities.iloc[idx].values[2]
  136. med_entity = df_annotated_text_entities.iloc[idx].values[0]
  137. # Append to list of entities if the key,value pair already exist
  138. try:
  139. entities_dict[category].append(med_entity)
  140. except:
  141. entities_dict[category] = []
  142. entities_dict[category].append(med_entity)
  143. # API call
  144. key = addTask(datastore_client, doc_title, entities_dict)
  145. logging.info('The upload of {} entities is done.'.format(doc_title))