Aziz Ketari 4 years ago
parent
commit
f8244e1779

BIN
.DS_Store


+ 6 - 6
README.md

@@ -1,5 +1,5 @@
 # COVID-19 public dataset on GCP from cases in Italy
-> by the Italian Society of Medical and Interventional Radiology (ISMIR)
+> Medical notes and entities from TRUE patient cases publicly available on BigQuery and Datastore!
 
 This repository contains all the code required to extract relevant information from pdf documents published by ISMIR 
 and store raw data in  a relational database and entities in a No-SQL database.
@@ -27,8 +27,10 @@ You can replicate this pipeline directly on your local machine or on the cloud s
 - You need a Google Cloud project and IAM rights to create service accounts.
 - Create and Download the json key associated with your Service Account. Useful [link](https://cloud.google.com/iam/docs/creating-managing-service-account-keys#iam-service-account-keys-create-python)
 - Modify the values to each variables in env_variables.sh file then run
+
 ```
-source env_variables.sh
+cd ./covid19_ISMIR
+source ./content/env_variables.sh
 ```
 
 - Set the project that you will be working on:
@@ -50,7 +52,6 @@ sudo python3 get-pip.py
 `ERROR: Package 'scispacy' requires a different Python: 3.5.3 not in '>=3.6.0'`
 
 ```
-cd ~/covid19_ISMIR
 pip3 install --user -r requirements.txt
 ```
 
@@ -65,10 +66,9 @@ will automatically download a model for you and install it.
 
 - **Step 1:** Download the required files to your bucket and load the required model in your local  
 (this step will take ~10 min)
-> Optional: If you have already downloaded the scispacy model, you should modify the file ./content/download_content.sh to not repeat that step
+> Optional: If you have already downloaded the scispacy models, you should modify the file ./content/download_content.sh to not repeat that step
 ```
 source ./content/download_content.sh
-pip install -U ./scispacy_models/en_core_sci_lg-0.2.4.tar.gz
 ```
 
 - **Step 2:** Start the extraction of text from the pdf documents  
@@ -85,7 +85,7 @@ Following the pre-processing, it's time to store the data in a more searchable f
 [BigQuery](https://cloud.google.com/bigquery) - for the text, and a No-SQL database - 
 [Datastore](https://cloud.google.com/datastore) - for the (UMLS) medical entities. 
 
-`python3 ./scripts/storing.py`
+`python3 ./scripts/storing.py True True [Model_of_your_choice]`
 
 ## Test
 Last but not least, you can query your databases using this script.

BIN
content/.DS_Store


+ 8 - 2
content/download_content.sh

@@ -1,3 +1,9 @@
 gsutil -m cp -r gs://covid19-public-dataset-aketari/pdf/* gs://$BUCKET_NAME/pdf/
-gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_lg-0.2.4.tar.gz ./scispacy_models/en_core_sci_lg-0.2.4.tar.gz
-gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_sm-0.2.4.tar.gz ./scispacy_models/en_core_sci_sm-0.2.4.tar.gz
+gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_lg-0.2.4.tar.gz ./content/scispacy_models/en_core_sci_lg-0.2.4.tar.gz
+gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_core_sci_sm-0.2.4.tar.gz ./content/scispacy_models/en_core_sci_sm-0.2.4.tar.gz
+gsutil -m cp -r gs://covid19-public-dataset-aketari/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz ./content/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz
+
+# Installing all supported NER models
+pip install -U ./content/scispacy_models/en_core_sci_sm-0.2.4.tar.gz
+pip install -U ./content/scispacy_models/en_core_sci_lg-0.2.4.tar.gz
+pip install -U ./content/scispacy_models/en_ner_bc5cdr_md-0.2.4.tar.gz
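
Editorial note: an optional sanity check, sketched under the assumption that the three pip installs above completed; each scispacy model ships as an importable Python package exposing load():

```
import en_core_sci_lg
import en_core_sci_sm
import en_ner_bc5cdr_md

# A failed import here means the matching pip install above did not succeed.
for module in (en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md):
    nlp = module.load()
    print(module.__name__, nlp.pipe_names)
```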

+ 0 - 0
env_variables.sh → content/env_variables.sh


+ 3 - 3
requirements.txt

@@ -4,7 +4,7 @@ google-cloud-bigquery==1.24.0
 google-cloud-datastore==1.11.0
 google-cloud-translate==2.0.1
 google-cloud-vision==1.0.0
-google-oauth2-tool====0.0.3
+google-oauth2-tool==0.0.3
 googleapis-common-protos==1.51.0
-pandas==1.0.3
-scispacy==0.2.4
+pandas
+scispacy

+ 12 - 11
scripts/preprocessing.py

@@ -30,7 +30,8 @@ customize_stop_words = [
     'Claudia', 'Lopez', 'st', 'a.', 'a', 'of', 's', 'cien', 'ze', 'diolog', 'ic', 'he',
     'â', '€', 's', 'b', 'case', 'Cuoladi', 'l', 'c', 'ra', 'bergamo', 'patelli', 'est', 'asst',
     'dr', 'Dianluigi', 'Svizzero', 'i', 'riccardo', 'Alessandro', 'Spinazzola', 'angelo',
-    'maggiore', 'p', 'r', 't', 'm', 'en', 't', 'o', 'd', 'e', 'n', 'd', 'o', 'g', 'h', 'u'
+    'maggiore', 'p', 'r', 't', 'm', 'en', 't', 'o', 'd', 'e', 'n', 'd', 'o', 'g', 'h', 'u',
+    'man', 'female', 'D'
 ]
 
 start_time = time.time()
@@ -42,16 +43,16 @@ for blob in lst_raw_txt_blobs:
     processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'
 
     # Translate raw text to English
-    #try:
-    batch_translate_text(translate_client=translate_client,
-                         project_id=project_id,
-                         input_uri=txt_gcs_dest_path,
-                         output_uri=eng_txt_gcs_dest_path)
-    logging.info("Translation of {} document was successful.".format(doc_title))
-    # except Exception as e:
-    #     logging.error("Error", e)
-
-    # Process eng raw text
+    try:
+        batch_translate_text(translate_client=translate_client,
+                             project_id=project_id,
+                             input_uri=txt_gcs_dest_path,
+                             output_uri=eng_txt_gcs_dest_path)
+        logging.info("Translation of {} document was successful.".format(doc_title))
+    except Exception as e:
+        logging.error("Error: %s", e)
+
+    # Curate eng raw text
     blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
                                                                         bucket_name,
                                                                         doc_title)

+ 6 - 5
scripts/retrieving.py

@@ -1,11 +1,11 @@
 from google.cloud import storage, bigquery, datastore
 from google.oauth2 import service_account
-from utils.bq_fcn import returnQueryResults
+from utils.bq_fcn import returnQueryResults, constructQuery
 from utils.ner_fcn import getCases
-
+import os
 import logging
 logging.getLogger().setLevel(logging.INFO)
-import os
+
 
 project_id = os.getenv('PROJECT_ID')
 bucket_name = os.getenv('BUCKET_NAME')
@@ -23,13 +23,14 @@ datastore_client = datastore.Client(credentials=credentials)
 
 # Returns a list of results
 try:
-    results_lst = returnQueryResults(bq_client, project_id, dataset_name, table_name, case_id)
+    query = constructQuery(column_lst=['*'], case_id='case23')
+    results_lst = returnQueryResults(bq_client, query)
     logging.info("Here is the result of the test query: \n {}".format(results_lst))
 except Exception as e:
     logging.error("Error", e)
 
 try:
-    filter_dict = {'Sign or Symptom':['onset symptoms', "chills"]}
+    filter_dict = {'Sign or Symptom': ['onset symptoms', "chills"]}
     results = getCases(datastore_client, filter_dict, limit=10)
     logging.info("Here is the result of the test query: \n {}".format(results))
 except Exception as e:

+ 53 - 107
scripts/storing.py

@@ -1,32 +1,44 @@
 from google.cloud import storage, bigquery, datastore
 from google.oauth2 import service_account
-from utils.bq_fcn import bqCreateDataset, bqCreateTable, exportItems2BQ
-from utils.ner_fcn import loadModel, addTask, extractMedEntities
-from scispacy.umls_linking import UmlsEntityLinker
-from scispacy.abbreviation import AbbreviationDetector
-
+from utils.bq_fcn import populateBQ
+from utils.ner_fcn import populateDatastore
 import logging
+import argparse
+import os
+import time
+
+# Logging configuration
 logging.getLogger().setLevel(logging.INFO)
 
-try:
-    import en_core_sci_sm
-except:
-    logging.warning("404: en_core_sci_sm NOT FOUND. Make sure the model was downloaded and installed.")
+# Create the parser
+parser = argparse.ArgumentParser(description='Select the model of interest.')
 
-try:
-    import en_core_sci_lg
-except:
-    logging.warning("404: en_core_sci_lg NOT FOUND. Make sure the model was downloaded and installed.")
-try:
-    import en_ner_bionlp13cg_md
-except:
-    logging.warning("404: en_ner_bionlp13cg_md NOT FOUND. Make sure the model was downloaded and installed.")
+# Add the arguments
+parser.add_argument('store_bigquery',
+                    metavar='bool',
+                    choices=['True', 'False'],
+                    help='Store data in BigQuery. Options: True or False')
 
+parser.add_argument('store_datastore',
+                    metavar='bool',
+                    choices=['True', 'False'],
+                    help='Store data in Datastore. Options: True or False')
+
+model_choices = ['en_core_sci_sm', 'en_core_sci_lg', 'en_ner_bc5cdr_md']
+parser.add_argument('model_name',
+                    metavar='name',
+                    type=str,
+                    help='Model options: en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md')
+
+# Execute the parse_args() method
+args = parser.parse_args()
+if args.store_datastore == 'True' and not args.model_name:
+    parser.error('Storing in Datastore requires model_name to be set to a specific model.')
+elif args.store_datastore == 'True' and args.model_name not in model_choices:
+    parser.error('Storing in Datastore requires model_name to be one of the supported models: {}.'.format(model_choices))
 
-import time
-import os
-import pandas as pd
 
+model_name = args.model_name
 project_id = os.getenv('PROJECT_ID')
 bucket_name = os.getenv('BUCKET_NAME')
 location = os.getenv('LOCATION')
@@ -42,90 +54,24 @@ datastore_client = datastore.Client(credentials=credentials)
 
 bq_client = bigquery.Client(credentials=credentials)
 
-gcs_source_prefix = 'raw_txt'
-lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
-                                      prefix=gcs_source_prefix)
-
-start_time = time.time()
-
-try:
-    dataset_id = bqCreateDataset(bq_client, dataset_name)
-    logging.info("The following dataset {} was successfully created/retrieved.".format(dataset_name))
-except Exception as e:
-    logging.error("An error occurred.", e)
-
-try:
-    table_id = bqCreateTable(bq_client, dataset_id, table_name)
-    logging.info("The following table {} was successfully created/retrieved.".format(table_name))
-except Exception as e:
-    logging.error("An error occurred.", e)
-
-for blob in lst_blobs:
-    doc_title = blob.name.split('/')[-1].split('.txt')[0]
-
-    # download as string
-    it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title))
-
-    # set the GCS path
-    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, bucket_name, doc_title)
-    eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw)
-
-    # Upload blob of interest
-    curated_eng_blob = storage_client.get_bucket(bucket_name) \
-        .get_blob('curated_eng_txt/{}.txt'.format(doc_title))
-
-    # populate to BQ dataset
-    exportItems2BQ(bq_client, dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
-
-total_time = time.time() - start_time
-logging.info('The export to BigQuery was completed successfully and took {} minutes.'.format(round(total_time / 60, 1)))
-
-curated_gcs_source_prefix = 'curated_eng_txt'
-lst_curated_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
-                                              prefix=curated_gcs_source_prefix)
-
-nlp = loadModel(model=en_core_sci_sm)
-
-start_time = time.time()
-for blob in lst_curated_blobs:
-    doc_title = blob.name.split('/')[-1].split('.txt')[0]
-
-    # download as string
-    eng_string = blob.download_as_string().decode('utf-8')
-
-    # convert to vector
-    doc = nlp(eng_string)
-
-    # Extract medical entities
-    UMLS_tuis_entity = extractMedEntities(doc)
-
-    # Generate dataframes
-    entities = list(UMLS_tuis_entity.keys())
-    TUIs = list(UMLS_tuis_entity.values())
-    df_entities = pd.DataFrame(data={'entity': entities, 'TUIs': TUIs})
-    df_reference_TUIs = pd.read_csv('./utils/UMLS_tuis.csv')
-    df_annotated_text_entities = pd.merge(df_entities, df_reference_TUIs, how='inner', on=['TUIs'])
-
-    # Upload entities to datastore
-    entities_dict = {}
-    for idx in range(df_annotated_text_entities.shape[0]):
-        category = df_annotated_text_entities.iloc[idx].values[2]
-        med_entity = df_annotated_text_entities.iloc[idx].values[0]
-
-        # Append to list of entities if the key,value pair already exist
-        try:
-            entities_dict[category].append(med_entity)
-        except:
-            entities_dict[category] = []
-            entities_dict[category].append(med_entity)
-
-        # API call
-    key = addTask(datastore_client, doc_title, entities_dict)
-    logging.info('The upload of {} entities is done.'.format(doc_title))
-
-total_time = time.time() - start_time
-logging.info(
-    "The export to Datastore was completed successfully and took {} minutes.".format(round(total_time / 60, 1)))
-
-
-
+if args.store_bigquery == 'True':
+    start_time = time.time()
+    populateBQ(bq_client=bq_client, storage_client=storage_client,
+               bucket_name=bucket_name, dataset_name=dataset_name,
+               table_name=table_name)
+    total_time = time.time() - start_time
+    logging.info(
+        'The export to BigQuery was completed successfully and took {} seconds.'.format(round(total_time, 1)))
+else:
+    logging.info('The export to BigQuery was disabled.')
+
+if args.store_datastore == 'True':
+    start_time = time.time()
+    populateDatastore(datastore_client=datastore_client, storage_client=storage_client,
+                      bucket_name=bucket_name, model_name=model_name)
+    total_time = time.time() - start_time
+    logging.info(
+        "The export to Datastore was completed successfully and took {} seconds.".format(round(total_time, 1)))
+else:
+    logging.info('The export to Datastore was disabled.')
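
Editorial note: the two storage flags are parsed as the literal strings 'True'/'False' and compared with `== 'True'`, not as Python booleans. A minimal standalone sketch (illustrative only, mirroring the argparse setup above) of how `python3 ./scripts/storing.py True False en_core_sci_sm` would parse:

```
import argparse

parser = argparse.ArgumentParser(description='Select the model of interest.')
parser.add_argument('store_bigquery', metavar='bool', choices=['True', 'False'])
parser.add_argument('store_datastore', metavar='bool', choices=['True', 'False'])
parser.add_argument('model_name', metavar='name', type=str)

args = parser.parse_args(['True', 'False', 'en_core_sci_sm'])
print(args.store_bigquery == 'True')   # True  -> BigQuery export runs
print(args.store_datastore == 'True')  # False -> Datastore export is skipped
print(args.model_name)                 # en_core_sci_sm
```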

+ 91 - 23
scripts/utils/bq_fcn.py

@@ -6,6 +6,7 @@ def bqCreateDataset(bq_client, dataset_name):
     """
     Creates a dataset on Google Cloud Platform.
     Args:
+        bq_client: BigQuery client instance -
         dataset_name: str - Name of the dataset
     Returns:
         dataset_id: str - Reference id for the dataset just created
@@ -27,6 +28,7 @@ def bqCreateTable(bq_client, dataset_id, table_name):
     """
     Create main table with all cases and the medical text.
     Args:
+        bq_client: BigQuery client instance -
         dataset_id: str - Reference id for the dataset to use
         table_name: str - Name of the table to create
 
@@ -57,15 +59,16 @@ def exportItems2BQ(bq_client, dataset_id, table_id, case, it_raw_blob, eng_raw_b
     """
     Export text data to BigQuery.
     Args:
-        dataset_id:
-        table_id:
-        case:
-        it_raw_blob:
-        eng_raw_blob:
-        curated_eng_blob:
+        bq_client: BigQuery client instance -
+        dataset_id: str -
+        table_id: str -
+        case: str -
+        it_raw_blob: gcs blob object -
+        eng_raw_blob: gcs blob object -
+        curated_eng_blob: gcs blob object -
 
     Returns:
-
+        Logging completion
     """
     # Prepares a reference to the dataset
     dataset_ref = bq_client.dataset(dataset_id)
@@ -85,32 +88,97 @@ def exportItems2BQ(bq_client, dataset_id, table_id, case, it_raw_blob, eng_raw_b
                        }]
     errors = bq_client.insert_rows(table, rows_to_insert)  # API request
     assert errors == []
-    logging.info('{} was added to {} dataset, specifically in {} table.'.format(case,
-                                                                                dataset_id,
-                                                                                table_id))
+    return logging.info('{} was added to {} dataset, specifically in {} table.'.format(case,
+                                                                                       dataset_id,
+                                                                                       table_id))
+
 
+def constructQuery(column_lst, case_id):
+    """
+    Construct the query to public dataset: aketari-covid19-public.covid19.ISMIR
+    Args:
+        column_lst: list - ["*"] or ["column_name1", "column_name2" ...]
+        case_id: str - Optional, e.g. "case1"
 
-def returnQueryResults(bq_client, project_id, dataset_id, table_id, case_id):
+    Returns:
+        query: str - the constructed SQL query string
+    """
+    # Public dataset
+    # project_id = 'aketari-covid19-public'
+    # dataset_id = 'covid19'
+    # table_id = 'ISMIR'
+
+    if (len(column_lst) == 1) and column_lst[0] == "*":
+        query = ('SELECT * FROM `aketari-covid19-public.covid19.ISMIR` '
+                 'WHERE `case`="{}" '.format(case_id))
+        return query
+    else:
+        columns_str = ", ".join(column_lst)
+        query = ('SELECT {} FROM `aketari-covid19-public.covid19.ISMIR` '
+                 'WHERE `case`="{}" '.format(columns_str, case_id))
+        return query
+
+
+def returnQueryResults(bq_client, query):
     """
     Get results from a BigQuery query.
     Args:
-        bq_client:
-        project_id:
-        dataset_id:
-        table_id:
-        case_id:
+        bq_client: BigQuery client instantiation -
+        query: str - SQL query string (e.g. built by constructQuery)
 
     Returns:
+        list of all rows of the query
+    """
+
+    try:
+        query_job = bq_client.query(query)
+        return list(query_job.result())
+    except Exception as e:
+        return logging.error("Error: %s", e)
+
 
+def populateBQ(bq_client, storage_client, bucket_name, dataset_name, table_name):
     """
+    Populate BigQuery dataset.
+    Args:
+        bq_client: BigQuery client instantiation -
+        storage_client: Storage client instance -
+        bucket_name: str -
+        dataset_name: str -
+        table_name: str -
 
-    query = ('SELECT * FROM `{}.{}.{}` WHERE `case`="{}" LIMIT 1'.format(project_id, dataset_id, table_id, case_id))
+    Returns:
+        Populated BigQuery data warehouse
+    """
+    try:
+        dataset_id = bqCreateDataset(bq_client, dataset_name)
+        logging.info("The following dataset {} was successfully created/retrieved.".format(dataset_name))
+    except Exception as e:
+        logging.error("An error occurred.", e)
 
     try:
-        query_job = bq_client.query(query)
-        is_exist = len(list(query_job.result())) >= 1
-        logging.info('Query case id: {}'.format(case_id) if is_exist \
-                         else "Case id: {} does NOT exist".format(case_id))
-        logging.info(list(query_job.result()))
+        table_id = bqCreateTable(bq_client, dataset_id, table_name)
+        logging.info("The following table {} was successfully created/retrieved.".format(table_name))
     except Exception as e:
-        logging.error("Error", e)
+        logging.error("An error occurred.", e)
+
+    gcs_source_prefix = 'raw_txt'
+    lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
+                                          prefix=gcs_source_prefix)
+
+    for blob in lst_blobs:
+        doc_title = blob.name.split('/')[-1].split('.txt')[0]
+
+        # download as string
+        it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title))
+
+        # set the GCS path
+        path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, bucket_name, doc_title)
+        eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw)
+
+        # Get the curated English text blob of interest
+        curated_eng_blob = storage_client.get_bucket(bucket_name) \
+            .get_blob('curated_eng_txt/{}.txt'.format(doc_title))
+
+        # populate to BQ dataset
+        exportItems2BQ(bq_client, dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
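
Editorial note: a short usage sketch of the new query helpers, assuming it is run from the scripts/ directory (like retrieving.py) and that default application credentials can read the public aketari-covid19-public.covid19.ISMIR table hard-coded in constructQuery; the case id 'case23' mirrors the one used in retrieving.py:

```
from google.cloud import bigquery

from utils.bq_fcn import constructQuery, returnQueryResults

# Default credentials are assumed here; retrieving.py builds the client from an
# explicit service-account key instead.
bq_client = bigquery.Client()

query = constructQuery(column_lst=['*'], case_id='case23')
rows = returnQueryResults(bq_client, query)  # list of BigQuery Row objects, or None on error
print(rows)
```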

+ 85 - 12
scripts/utils/ner_fcn.py

@@ -1,20 +1,34 @@
 from google.cloud import datastore
-
+from scispacy.umls_linking import UmlsEntityLinker
 import logging
+import pandas as pd
 import re
 
-from scispacy.umls_linking import UmlsEntityLinker
-from scispacy.abbreviation import AbbreviationDetector
+def importModel(model_name):
+    """
+    Selective import of the required model from scispacy.
+    Args:
+        model_name: str - one of en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md
+
+    Returns:
+        the imported model module, or None if the name is not supported
+    """
+    if model_name == 'en_core_sci_sm':
+        import en_core_sci_sm as model_module
+    elif model_name == 'en_core_sci_lg':
+        import en_core_sci_lg as model_module
+    elif model_name == 'en_ner_bc5cdr_md':
+        import en_ner_bc5cdr_md as model_module
+    else:
+        return None
+    return model_module
 
 def loadModel(model):
     """
     Loading Named Entity Recognition model.
     Args:
-        model: options: en_core_sci_sm, en_core_sci_lg, en_ner_bionlp13cg_md
+        model: options: en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md
 
     Returns:
         nlp: loaded model
+        linker: loaded add-on
     """
     # Load the model
     nlp = model.load()
@@ -23,19 +37,16 @@ def loadModel(model):
     linker = UmlsEntityLinker(resolve_abbreviations=True)
     nlp.add_pipe(linker)
 
-    # Add the abbreviation pipe to the spacy pipeline.
-    abbreviation_pipe = AbbreviationDetector(nlp)
-    nlp.add_pipe(abbreviation_pipe)
     logging.info("Model and add-ons successfully loaded.")
-    return nlp
+    return nlp, linker
 
 
-def extractMedEntities(vectorized_doc):
+def extractMedEntities(vectorized_doc, linker):
     """
     Returns UMLS entities contained in a text.
     Args:
         vectorized_doc:
-
+        linker: UmlsEntityLinker instance returned by loadModel -
     Returns:
         UMLS_tuis_entity: dict - key: entity and value: TUI code
     """
@@ -45,8 +56,6 @@ def extractMedEntities(vectorized_doc):
     UMLS_tuis_entity = {}
     entity_dict = {}
 
-    linker = UmlsEntityLinker(resolve_abbreviations=True)
-
     for idx in range(len(vectorized_doc.ents)):
         entity = vectorized_doc.ents[idx]
         entity_dict[entity] = ''
@@ -104,3 +113,67 @@ def getCases(datastore_client, filter_dict, limit=10):
             query.add_filter(key, '=', value)
     results = list(query.fetch(limit=limit))
     return results
+
+
+def populateDatastore(datastore_client, storage_client, bucket_name, model_name):
+    """
+    Extract UMLS entities and store them in a No-SQL db: Datastore.
+    Args:
+        datastore_client: Datastore client instance -
+        storage_client: Storage client instance -
+        bucket_name: str -
+        model_name: str -
+
+    Returns:
+        Queryable database
+    """
+    curated_gcs_source_prefix = 'curated_eng_txt'
+    lst_curated_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
+                                                  prefix=curated_gcs_source_prefix)
+
+    # importModel returns the module so the locally imported model is usable here
+    model_module = importModel(model_name)
+    if model_module is None:
+        return False
+
+    nlp, linker = loadModel(model=model_module)
+
+    for blob in lst_curated_blobs:
+        doc_title = blob.name.split('/')[-1].split('.txt')[0]
+
+        # download as string
+        eng_string = blob.download_as_string().decode('utf-8')
+
+        # convert to vector
+        doc = nlp(eng_string)
+
+        # Extract medical entities
+        UMLS_tuis_entity = extractMedEntities(doc, linker)
+
+        # Mapping of UMLS entities with reference csv
+        entities = list(UMLS_tuis_entity.keys())
+        TUIs = list(UMLS_tuis_entity.values())
+        df_entities = pd.DataFrame(data={'entity': entities, 'TUIs': TUIs})
+        df_reference_TUIs = pd.read_csv('./scripts/utils/UMLS_tuis.csv')
+        df_annotated_text_entities = pd.merge(df_entities, df_reference_TUIs, how='inner', on=['TUIs'])
+
+        # Upload entities to datastore
+        entities_dict = {}
+        for idx in range(df_annotated_text_entities.shape[0]):
+            category = df_annotated_text_entities.iloc[idx].values[2]
+            med_entity = df_annotated_text_entities.iloc[idx].values[0]
+
+            # Append to list of entities if the key,value pair already exist
+            try:
+                entities_dict[category].append(med_entity)
+            except:
+                entities_dict[category] = []
+                entities_dict[category].append(med_entity)
+
+        # API call
+        key = addTask(datastore_client, doc_title, entities_dict)
+        logging.info('The upload of {} entities is done.'.format(doc_title))
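
Editorial note: a minimal standalone sketch of the entity-extraction path that populateDatastore wraps, assuming en_core_sci_sm is installed and the scispacy UMLS linker data can be fetched on first use; the sample sentence and the printed mapping are only illustrative:

```
import en_core_sci_sm

from utils.ner_fcn import loadModel, extractMedEntities

# loadModel attaches the UmlsEntityLinker and returns both the pipeline and the linker.
nlp, linker = loadModel(model=en_core_sci_sm)

doc = nlp("The patient presented with fever, dry cough and bilateral interstitial pneumonia.")
entity_to_tui = extractMedEntities(doc, linker)
print(entity_to_tui)  # e.g. {'fever': 'T184', ...} - entity text mapped to its UMLS TUI code
```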