# ner_fcn.py

  1. from google.cloud import datastore
  2. import logging
  3. import re
  4. from scispacy.umls_linking import UmlsEntityLinker
  5. from scispacy.abbreviation import AbbreviationDetector
  6. def loadModel(model):
  7. """
  8. Loading Named Entity Recognition model.
  9. Args:
  10. model: options: en_core_sci_sm, en_core_sci_lg, en_ner_bionlp13cg_md
  11. Returns:
  12. nlp: loaded model
  13. """
  14. # Load the model
  15. nlp = model.load()
  16. # Add pipe features to pipeline
  17. linker = UmlsEntityLinker(resolve_abbreviations=True)
  18. nlp.add_pipe(linker)
  19. # Add the abbreviation pipe to the spacy pipeline.
  20. abbreviation_pipe = AbbreviationDetector(nlp)
  21. nlp.add_pipe(abbreviation_pipe)
  22. logging.info("Model and add-ons successfully loaded.")
  23. return nlp
  24. def extractMedEntities(vectorized_doc):
  25. """
  26. Returns UMLS entities contained in a text.
  27. Args:
  28. vectorized_doc:
  29. Returns:
  30. UMLS_tuis_entity: dict - key: entity and value: TUI code
  31. """
  32. # Pattern for TUI code
  33. pattern = 'T(\d{3})'
  34. UMLS_tuis_entity = {}
  35. entity_dict = {}
  36. linker = UmlsEntityLinker(resolve_abbreviations=True)
  37. for idx in range(len(vectorized_doc.ents)):
  38. entity = vectorized_doc.ents[idx]
  39. entity_dict[entity] = ''
  40. for umls_ent in entity._.umls_ents:
  41. entity_dict[entity] = linker.umls.cui_to_entity[umls_ent[0]]
  42. # RegEx expression if contains TUI code
  43. tui = re.search(pattern, str(entity_dict[entity]))
  44. if tui:
  45. UMLS_tuis_entity[str(entity)] = tui.group()
  46. else:
  47. UMLS_tuis_entity[str(entity)] = None
  48. return UMLS_tuis_entity
  49. def addTask(datastore_client, doc_title, entities_dict):
  50. """
  51. Upload entities to Datastore.
  52. Args:
  53. datastore_client:
  54. doc_title:
  55. entities_dict:
  56. Returns:
  57. Datastore key object.
  58. """
  59. key = datastore_client.key('case', doc_title)
  60. task = datastore.Entity(key=key)
  61. task.update(
  62. entities_dict
  63. )
  64. datastore_client.put(task)
  65. # Then get by key for this entity
  66. logging.info("Uploaded {} to Datastore.".format(doc_title))
  67. return datastore_client.get(key)
  68. def getCases(datastore_client, filter_dict, limit=10):
  69. """
  70. Get results of query with custom filters
  71. Args:
  72. datastore_client: Client object
  73. filter_dict: dict - e.g {parameter_A: [entity_name_A, entity_name_B],
  74. parameter_B: [entitiy_name_C]
  75. }
  76. limit: int - result limits per default 10
  77. Returns:
  78. results: list - query results
  79. """
  80. query = datastore_client.query(kind='case')
  81. for key, values in filter_dict.items():
  82. for value in values:
  83. query.add_filter(key, '=', value)
  84. results = list(query.fetch(limit=limit))
  85. return results