# ner_fcn.py — NER utilities: model loading, UMLS entity extraction, Datastore upload.
import logging
import re

import pandas as pd
from google.cloud import datastore
from scispacy.umls_linking import UmlsEntityLinker

import en_core_sci_sm
import en_core_sci_lg
import en_ner_bc5cdr_md
  7. def loadModel(model):
  8. """
  9. Loading Named Entity Recognition model.
  10. Args:
  11. model: options: en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md
  12. Returns:
  13. nlp: loaded model
  14. linker: loaded add-on
  15. """
  16. # Load the model
  17. nlp = model.load()
  18. # Add pipe features to pipeline
  19. linker = UmlsEntityLinker(resolve_abbreviations=True)
  20. nlp.add_pipe(linker)
  21. logging.info("Model and add-ons successfully loaded.")
  22. return nlp, linker
  23. def extractMedEntities(vectorized_doc, linker):
  24. """
  25. Returns UMLS entities contained in a text.
  26. Args:
  27. vectorized_doc:
  28. linker:
  29. Returns:
  30. UMLS_tuis_entity: dict - key: entity and value: TUI code
  31. """
  32. # Pattern for TUI code
  33. pattern = 'T(\d{3})'
  34. UMLS_tuis_entity = {}
  35. entity_dict = {}
  36. for idx in range(len(vectorized_doc.ents)):
  37. entity = vectorized_doc.ents[idx]
  38. entity_dict[entity] = ''
  39. for umls_ent in entity._.umls_ents:
  40. entity_dict[entity] = linker.umls.cui_to_entity[umls_ent[0]]
  41. # RegEx expression if contains TUI code
  42. tui = re.search(pattern, str(entity_dict[entity]))
  43. if tui:
  44. UMLS_tuis_entity[str(entity)] = tui.group()
  45. else:
  46. UMLS_tuis_entity[str(entity)] = None
  47. return UMLS_tuis_entity
  48. def addTask(datastore_client, doc_title, entities_dict):
  49. """
  50. Upload entities to Datastore.
  51. Args:
  52. datastore_client:
  53. doc_title:
  54. entities_dict:
  55. Returns:
  56. Datastore key object.
  57. """
  58. key = datastore_client.key('case', doc_title)
  59. task = datastore.Entity(key=key)
  60. task.update(
  61. entities_dict
  62. )
  63. datastore_client.put(task)
  64. # Then get by key for this entity
  65. logging.info("Uploaded {} to Datastore.".format(doc_title))
  66. return datastore_client.get(key)
  67. def getCases(datastore_client, filter_dict, limit=10):
  68. """
  69. Get results of query with custom filters
  70. Args:
  71. datastore_client: Client object
  72. filter_dict: dict - e.g {parameter_A: [entity_name_A, entity_name_B],
  73. parameter_B: [entitiy_name_C]
  74. }
  75. limit: int - result limits per default 10
  76. Returns:
  77. results: list - query results
  78. """
  79. query = datastore_client.query(kind='case')
  80. for key, values in filter_dict.items():
  81. for value in values:
  82. query.add_filter(key, '=', value)
  83. results = list(query.fetch(limit=limit))
  84. return results
  85. def populateDatastore(datastore_client, storage_client, model_name, src_bucket='aketari-covid19-data-update'):
  86. """
  87. Extract UMLS entities and store them in a No-SQL db: Datastore.
  88. Args:
  89. datastore_client: Storage client instantiation -1111
  90. storage_client: Storage client instantiation -
  91. model_name: str -
  92. src_bucket: str - contains pdf of the newest files
  93. Returns:
  94. Queriable database
  95. """
  96. lst_curated_blobs = storage_client.list_blobs(bucket_or_name=src_bucket, prefix='curated_eng_txt')
  97. if model_name == 'en_core_sci_sm':
  98. nlp, linker = loadModel(model=en_core_sci_sm)
  99. elif model_name == 'en_core_sci_lg':
  100. nlp, linker = loadModel(model=en_core_sci_lg)
  101. elif model_name == 'en_ner_bc5cdr_md':
  102. nlp, linker = loadModel(model=en_ner_bc5cdr_md)
  103. else:
  104. return False
  105. for blob in lst_curated_blobs:
  106. doc_title = blob.name.split('/')[-1].split('.pdf')[0]
  107. logging.info(doc_title)
  108. # download as string
  109. eng_string = blob.download_as_string().decode('utf-8', "ignore")
  110. # convert to vector
  111. doc = nlp(eng_string)
  112. # Extract medical entities
  113. UMLS_tuis_entity = extractMedEntities(doc, linker)
  114. # Mapping of UMLS entities with reference csv
  115. entities = list(UMLS_tuis_entity.keys())
  116. TUIs = list(UMLS_tuis_entity.values())
  117. df_entities = pd.DataFrame(data={'entity': entities, 'TUIs': TUIs})
  118. df_reference_TUIs = pd.read_csv('./scripts/utils/UMLS_tuis.csv')
  119. df_annotated_text_entities = pd.merge(df_entities, df_reference_TUIs, how='inner', on=['TUIs'])
  120. # Upload entities to datastore
  121. entities_dict = {}
  122. for idx in range(df_annotated_text_entities.shape[0]):
  123. category = df_annotated_text_entities.iloc[idx].values[2]
  124. med_entity = df_annotated_text_entities.iloc[idx].values[0]
  125. # Append to list of entities if the key,value pair already exist
  126. try:
  127. entities_dict[category].append(med_entity)
  128. except:
  129. entities_dict[category] = []
  130. entities_dict[category].append(med_entity)
  131. # API call
  132. key = addTask(datastore_client, doc_title, entities_dict)
  133. logging.info('The upload of {} entities is done.'.format(doc_title))