In [1]:
from google.cloud import vision
from google.cloud import storage
from google.protobuf import json_format
from google.cloud import translate
from google.cloud import bigquery
from google.cloud import datastore
import logging

import re
import time
import pandas as pd
import numpy as np

#!sudo pip3 install scispacy
import scispacy
from spacy import displacy
#https://github.com/explosion/spacy-models/releases/download/en_core_sci_sm-2.2.0/en_core_sci_sm-2.2.0.tar.gz
#https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz
import en_core_sci_lg # en_ner_bionlp13cg_md, en_core_sci_sm
from scispacy.umls_linking import UmlsEntityLinker
from scispacy.abbreviation import AbbreviationDetector

References:

  • ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing (Neumann et al., 2019)
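One way to install the models referenced in the comments above (a sketch; the model version should match the installed scispacy release):

!sudo pip3 install scispacy
!sudo pip3 install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz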
In [2]:
project_id = "pm-preparation"
location = "us-central1"
storage_client = storage.Client()
vision_client = vision.ImageAnnotatorClient()
translate_client = translate.TranslationServiceClient()
datastore_client = datastore.Client()
bq_client = bigquery.Client()
In [3]:
def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    doc_title = gcs_source_uri.split('/')[-1].split('.pdf')[0]
    
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 20

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = vision_client.async_batch_annotate_files(
        requests=[async_request])

    # Wait for the OCR operation to finish (up to 3 minutes).
    operation.result(timeout=180)
    print('Text extraction from document {} is completed.'.format(doc_title))
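A minimal usage sketch (hypothetical paths, using the bucket configured in the processing cells below); Vision writes the results as one or more JSON shards named <prefix>output-<start>-to-<end>.json:

# Hypothetical example: OCR a single PDF into JSON shards under json/
bucket_name = 'covid19-public-dataset-aketari'
source_uri = 'gs://{}/pdf/case1.pdf'.format(bucket_name)    # input PDF
dest_prefix = 'gs://{}/json/case1-'.format(bucket_name)     # Vision appends output-1-to-20.json, ...
async_detect_document(vision_client, source_uri, dest_prefix)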
In [4]:
# Once the request has completed and the output has been
# written to GCS, we can list all the output files.
def read_json_result(json_gcs_path, doc_title):
    gcs_destination_prefix = 'json/' + '{}-'.format(doc_title)
    
    # List objects with the given prefix.
    blob_list = list(storage_client.list_blobs(bucket_or_name=bucket_name,
                                               prefix=gcs_destination_prefix))
    all_text = ''
    for blob in blob_list:

        json_string = blob.download_as_string()
        response = json_format.Parse(
            json_string, vision.types.AnnotateFileResponse())

        # Concatenate the text from every page response in this output shard.
        for page_response in response.responses:
            all_text += page_response.full_text_annotation.text
            all_text += ' '

    print("Parsed json doc: {}".format(doc_title))
    return all_text
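The function lists every shard under json/{doc_title}- via the module-level storage_client and bucket_name; its first argument is not used for the listing itself. A small sketch that stitches one document back together (hypothetical document name):

# Hypothetical example: rebuild the full OCR text of case1 from its JSON shards
bucket_name = 'covid19-public-dataset-aketari'
full_text = read_json_result('gs://{}/json/case1-'.format(bucket_name), 'case1')
print(full_text[:200])   # preview the first 200 characters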
In [5]:
def upload_blob(txt_content, destination_blob_name):
    """Uploads a file to the bucket."""
    destination_blob_name = destination_blob_name.split('gs://{}/'.format(bucket_name))[-1]
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_string(txt_content)

    print("Text uploaded to {}".format(destination_blob_name))
In [6]:
def batch_translate_text(
    input_uri="gs://YOUR_BUCKET_ID/path/to/your/file.txt",
    output_uri="gs://YOUR_BUCKET_ID/path/to/save/results/"):
    """Translates a batch of texts on GCS and stores the result in a GCS location."""

    # Supported file types: https://cloud.google.com/translate/docs/supported-formats
    gcs_source = {"input_uri": input_uri}

    input_configs_element = {
        "gcs_source": gcs_source,
        "mime_type": "text/plain"  # Can be "text/plain" or "text/html".
    }
    gcs_destination = {"output_uri_prefix": output_uri}
    output_config = {"gcs_destination": gcs_destination}
    parent = translate_client.location_path(project_id, location)

    # Supported language codes: https://cloud.google.com/translate/docs/language
    operation = translate_client.batch_translate_text(
        parent=parent,
        source_language_code="it",
        target_language_codes=["en"],  # Up to 10 language codes here.
        input_configs=[input_configs_element],
        output_config=output_config)

    # Block until the batch translation job finishes (up to 180 seconds).
    response = operation.result(180)
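Batch translation writes its output under output_uri with a generated file name; the processing loop below reconstructs it as {bucket}_raw_txt_{doc_title}_en_translations.txt. A hedged sketch with a hypothetical document:

# Hypothetical example: translate the raw Italian text of case1 and locate the result
bucket_name = 'covid19-public-dataset-aketari'
batch_translate_text(input_uri='gs://{}/raw_txt/case1.txt'.format(bucket_name),
                     output_uri='gs://{}/eng_txt/case1/'.format(bucket_name))
# Blob name as reconstructed later in this notebook
translated_blob_name = 'eng_txt/case1/{}_raw_txt_case1_en_translations.txt'.format(bucket_name)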
In [7]:
def removePunctuation(string): 
  
    # punctuation marks 
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  
    # traverse the given string and if any punctuation 
    # marks occur replace it with null 
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, "") 
  
    # Print string without punctuation 
    return string
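A one-pass equivalent (a sketch; remove_punctuation_fast is a hypothetical name, same character set as above):

def remove_punctuation_fast(text):
    # Delete every character in the punctuation set in a single pass
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    return text.translate(str.maketrans('', '', punctuations))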

This step takes approximately 1 hour and 20 minutes.

In [8]:
customize_stop_words = [
    'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',
    'Borgheresi', 'Agostini', 'Ottaviani', 'Floridi', 'Giovagnoni', 'di', 'specialization',
    'Polytechnic', 'University', 'marche', 'ANCONA', 'Italy', 'Azienda', 'Ospedali', 
    'Riuniti', 'Yorrette', 'Matera', 'Michele', 'Nardella', 'Gerardo', 'Costanzo',
    'Claudia', 'Lopez', 'st', 'a.', 'a', 'of', 's', 'cien', 'ze', 'diolog', 'ic', 'he',
    'â', '€','s','b','case','Cuoladi','l','c','ra','bergamo','patelli','est','asst',
    'dr','Dianluigi', 'Svizzero','i','riccardo','Alessandro','Spinazzola','angelo',
    'maggiore', 'p' ,'r' ,'t', 'm', 'en', 't', 'o', 'd', 'e', 'n', 'd', 'o', 'g', 'h', 'u'
]
In [15]:
# Process documents

bucket_name = 'covid19-public-dataset-aketari'
gcs_source_prefix = 'pdf'
lst_pdf_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                      prefix=gcs_source_prefix)

lst_json_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                      prefix='json')

overall_start_time = time.time()
for blob in lst_pdf_blobs:
    doc_title = blob.name.split('/')[-1].split('.pdf')[0]

    # Generate all paths
    gcs_source_path = 'gs://' + bucket_name + '/' + blob.name
    # Vision appends 'output-<start>-to-<end>.json' to this destination prefix
    json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title + '-'

    # OCR pdf documents
    async_detect_document(vision_client,
                          gcs_source_path,
                          json_gcs_dest_path)
print('OCR done.')
    
for blob in lst_json_blobs:
    doc_title = blob.name.split('/')[-1].split('-')[0]
    
    json_gcs_dest_path = 'gs://' + bucket_name + '/{}'.format(blob.name)
    txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'
    eng_txt_gcs_dest_path = 'gs://' + bucket_name + '/eng_txt/{}/'.format(doc_title)
    processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'
    
    # Parse json
    all_text = read_json_result(json_gcs_dest_path, doc_title)
    #files_metadata[doc_title]['text'] = all_text

    # Upload raw text to GCS
    upload_blob(all_text, txt_gcs_dest_path)

    # Translate raw text to english
    batch_translate_text(input_uri = txt_gcs_dest_path,
                        output_uri = eng_txt_gcs_dest_path)
    
    # Process eng raw text
    blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
                                                                         bucket_name,
                                                                         doc_title)
    
    eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix)
    eng_raw_string = eng_blob.download_as_string().decode('utf-8')
    
    # lowercase
    #sample_text = eng_raw_string.lower()

    # Remove dates
    # 1 or 2 digit number, a forward slash, 1 or 2 digit number, a forward slash, 4 digits
    pattern_dates = r'(\d{1,2})/(\d{1,2})/(\d{4})'
    pattern_fig = r'Figure (\d{1,2})'
    pattern_image = r'^Image .$'
    replace = ''

    eng_raw_string = re.sub(pattern_dates, replace, eng_raw_string) 
    eng_raw_string = re.sub(pattern_fig, replace, eng_raw_string)
    eng_raw_string = re.sub(pattern_image, replace, eng_raw_string) 
    
    # remove punctuation and special characters
    eng_raw_string = re.sub('[^A-Za-z0-9]+', ' ', eng_raw_string)

    # Remove custom stop words
    tokens = [token for token in eng_raw_string.split() if token not in customize_stop_words]

    refined_doc = ' '.join(tokens)
    
    # Upload raw text to GCS
    upload_blob(refined_doc, processed_eng_gcs_dest_path)

    #print('refinement completed')
    print('{} processing is done.'.format(doc_title))
Parsed json doc: case1
Text uploaded to raw_txt/case1.txt
Text uploaded to curated_eng_txt/case1.txt
case1 processing is done.
Parsed json doc: case10
Text uploaded to raw_txt/case10.txt
Text uploaded to curated_eng_txt/case10.txt
case10 processing is done.
Parsed json doc: case11
Text uploaded to raw_txt/case11.txt
Text uploaded to curated_eng_txt/case11.txt
case11 processing is done.
Parsed json doc: case12
Text uploaded to raw_txt/case12.txt
Text uploaded to curated_eng_txt/case12.txt
case12 processing is done.
Parsed json doc: case13
Text uploaded to raw_txt/case13.txt
Text uploaded to curated_eng_txt/case13.txt
case13 processing is done.
Parsed json doc: case14
Text uploaded to raw_txt/case14.txt
Text uploaded to curated_eng_txt/case14.txt
case14 processing is done.
Parsed json doc: case15
Text uploaded to raw_txt/case15.txt
Text uploaded to curated_eng_txt/case15.txt
case15 processing is done.
Parsed json doc: case16
Text uploaded to raw_txt/case16.txt
Text uploaded to curated_eng_txt/case16.txt
case16 processing is done.
Parsed json doc: case17
Text uploaded to raw_txt/case17.txt
Text uploaded to curated_eng_txt/case17.txt
case17 processing is done.
Parsed json doc: case18
Text uploaded to raw_txt/case18.txt
Text uploaded to curated_eng_txt/case18.txt
case18 processing is done.
Parsed json doc: case19
Text uploaded to raw_txt/case19.txt
Text uploaded to curated_eng_txt/case19.txt
case19 processing is done.
Parsed json doc: case2
Text uploaded to raw_txt/case2.txt
Text uploaded to curated_eng_txt/case2.txt
case2 processing is done.
Parsed json doc: case20
Text uploaded to raw_txt/case20.txt
Text uploaded to curated_eng_txt/case20.txt
case20 processing is done.
Parsed json doc: case21
Text uploaded to raw_txt/case21.txt
Text uploaded to curated_eng_txt/case21.txt
case21 processing is done.
Parsed json doc: case22
Text uploaded to raw_txt/case22.txt
Text uploaded to curated_eng_txt/case22.txt
case22 processing is done.
Parsed json doc: case23
Text uploaded to raw_txt/case23.txt
Text uploaded to curated_eng_txt/case23.txt
case23 processing is done.
Parsed json doc: case24
Text uploaded to raw_txt/case24.txt
Text uploaded to curated_eng_txt/case24.txt
case24 processing is done.
Parsed json doc: case25
Text uploaded to raw_txt/case25.txt
Text uploaded to curated_eng_txt/case25.txt
case25 processing is done.
Parsed json doc: case26
Text uploaded to raw_txt/case26.txt
Text uploaded to curated_eng_txt/case26.txt
case26 processing is done.
Parsed json doc: case27
Text uploaded to raw_txt/case27.txt
Text uploaded to curated_eng_txt/case27.txt
case27 processing is done.
Parsed json doc: case28
Text uploaded to raw_txt/case28.txt
Text uploaded to curated_eng_txt/case28.txt
case28 processing is done.
Parsed json doc: case29
Text uploaded to raw_txt/case29.txt
Text uploaded to curated_eng_txt/case29.txt
case29 processing is done.
Parsed json doc: case3
Text uploaded to raw_txt/case3.txt
Text uploaded to curated_eng_txt/case3.txt
case3 processing is done.
Parsed json doc: case30
Text uploaded to raw_txt/case30.txt
Text uploaded to curated_eng_txt/case30.txt
case30 processing is done.
Parsed json doc: case31
Text uploaded to raw_txt/case31.txt
Text uploaded to curated_eng_txt/case31.txt
case31 processing is done.
Parsed json doc: case32
Text uploaded to raw_txt/case32.txt
Text uploaded to curated_eng_txt/case32.txt
case32 processing is done.
Parsed json doc: case33
Text uploaded to raw_txt/case33.txt
Text uploaded to curated_eng_txt/case33.txt
case33 processing is done.
Parsed json doc: case34
Text uploaded to raw_txt/case34.txt
Text uploaded to curated_eng_txt/case34.txt
case34 processing is done.
Parsed json doc: case35
Text uploaded to raw_txt/case35.txt
Text uploaded to curated_eng_txt/case35.txt
case35 processing is done.
Parsed json doc: case36
Text uploaded to raw_txt/case36.txt
Text uploaded to curated_eng_txt/case36.txt
case36 processing is done.
Parsed json doc: case37
Text uploaded to raw_txt/case37.txt
Text uploaded to curated_eng_txt/case37.txt
case37 processing is done.
Parsed json doc: case38
Text uploaded to raw_txt/case38.txt
Text uploaded to curated_eng_txt/case38.txt
case38 processing is done.
Parsed json doc: case39
Text uploaded to raw_txt/case39.txt
Text uploaded to curated_eng_txt/case39.txt
case39 processing is done.
Parsed json doc: case4
Text uploaded to raw_txt/case4.txt
Text uploaded to curated_eng_txt/case4.txt
case4 processing is done.
Parsed json doc: case40
Text uploaded to raw_txt/case40.txt
Text uploaded to curated_eng_txt/case40.txt
case40 processing is done.
Parsed json doc: case41
Text uploaded to raw_txt/case41.txt
Text uploaded to curated_eng_txt/case41.txt
case41 processing is done.
Parsed json doc: case42
Text uploaded to raw_txt/case42.txt
Text uploaded to curated_eng_txt/case42.txt
case42 processing is done.
Parsed json doc: case43
Text uploaded to raw_txt/case43.txt
Text uploaded to curated_eng_txt/case43.txt
case43 processing is done.
Parsed json doc: case44
Text uploaded to raw_txt/case44.txt
Text uploaded to curated_eng_txt/case44.txt
case44 processing is done.
Parsed json doc: case45
Text uploaded to raw_txt/case45.txt
Text uploaded to curated_eng_txt/case45.txt
case45 processing is done.
Parsed json doc: case46
Text uploaded to raw_txt/case46.txt
Text uploaded to curated_eng_txt/case46.txt
case46 processing is done.
Parsed json doc: case47
Text uploaded to raw_txt/case47.txt
Text uploaded to curated_eng_txt/case47.txt
case47 processing is done.
Parsed json doc: case48
Text uploaded to raw_txt/case48.txt
Text uploaded to curated_eng_txt/case48.txt
case48 processing is done.
Parsed json doc: case49
Text uploaded to raw_txt/case49.txt
Text uploaded to curated_eng_txt/case49.txt
case49 processing is done.
Parsed json doc: case5
Text uploaded to raw_txt/case5.txt
Text uploaded to curated_eng_txt/case5.txt
case5 processing is done.
Parsed json doc: case6
Text uploaded to raw_txt/case6.txt
Text uploaded to curated_eng_txt/case6.txt
case6 processing is done.
Parsed json doc: case7
Text uploaded to raw_txt/case7.txt
Text uploaded to curated_eng_txt/case7.txt
case7 processing is done.
Parsed json doc: case8
Text uploaded to raw_txt/case8.txt
Text uploaded to curated_eng_txt/case8.txt
case8 processing is done.
Parsed json doc: case9
Text uploaded to raw_txt/case9.txt
Text uploaded to curated_eng_txt/case9.txt
case9 processing is done.

Upload to BigQuery

In [16]:
def bqCreateDataset(dataset_name):
    
    dataset_ref = bq_client.dataset(dataset_name)

    try:
        return bq_client.get_dataset(dataset_ref).dataset_id
    except:
        dataset = bigquery.Dataset(dataset_ref)
        dataset = bq_client.create_dataset(dataset)
        print('Dataset {} created.'.format(dataset.dataset_id))
        return dataset.dataset_id
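The bare except above also swallows unrelated failures (permissions, quota). A narrower sketch assuming the standard google.api_core exceptions; bq_create_dataset_strict is a hypothetical name:

from google.api_core.exceptions import NotFound

def bq_create_dataset_strict(dataset_name):
    dataset_ref = bq_client.dataset(dataset_name)
    try:
        return bq_client.get_dataset(dataset_ref).dataset_id
    except NotFound:
        # Only create the dataset when it genuinely does not exist
        dataset = bq_client.create_dataset(bigquery.Dataset(dataset_ref))
        print('Dataset {} created.'.format(dataset.dataset_id))
        return dataset.dataset_id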
In [17]:
def bqCreateTable(dataset_id, table_name):
    """
    Create main table with all cases and the medical text.
    return:
        table_id
    """
    dataset_ref = bq_client.dataset(dataset_id)

    # Prepares a reference to the table
    table_ref = dataset_ref.table(table_name)

    try:
        return bq_client.get_table(table_ref).table_id
    except:
        schema = [
            bigquery.SchemaField('case', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('it_raw_txt', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('eng_raw_txt', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('eng_txt', 'STRING', mode='REQUIRED', 
                                 description='Output of preprocessing pipeline.')]
        table = bigquery.Table(table_ref, schema=schema)
        table = bq_client.create_table(table)
        print('table {} created.'.format(table.table_id))
        return table.table_id
In [18]:
def exportItems2BQ(dataset_id, table_id, case,
                   it_raw_blob, eng_raw_blob, curated_eng_blob):

    # Prepares a reference to the dataset
    dataset_ref = bq_client.dataset(dataset_id)

    table_ref = dataset_ref.table(table_id)
    table = bq_client.get_table(table_ref)  # API call
    
    # Download text from GCS
    it_raw_txt_string = it_raw_blob.download_as_string().decode('utf-8')
    eng_raw_txt_string = eng_raw_blob.download_as_string().decode('utf-8')
    curated_eng_string = curated_eng_blob.download_as_string().decode('utf-8')
    
    rows_to_insert = [{'case': case,
                      'it_raw_txt': it_raw_txt_string,
                      'eng_raw_txt': eng_raw_txt_string,
                      'eng_txt': curated_eng_string
                     }]
    errors = bq_client.insert_rows(table, rows_to_insert)  # API request
    assert errors == []
    print('{} was added to {} dataset, specifically in {} table.'.format(case,
                                                                        dataset_id,
                                                                        table_id))
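insert_rows fetches the table first and streams rows against its schema. If the installed google-cloud-bigquery version provides insert_rows_json, the schema lookup can be skipped; a sketch with placeholder values:

# Hypothetical alternative: stream a row as JSON without fetching the table schema
table_ref = bq_client.dataset('covid19').table('ISMIR_cases')
rows = [{'case': 'caseX',                     # hypothetical case id
         'it_raw_txt': 'testo italiano ...',  # placeholder values
         'eng_raw_txt': 'english text ...',
         'eng_txt': 'curated english text ...'}]
errors = bq_client.insert_rows_json(table_ref, rows)
assert errors == []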
In [19]:
bucket_name = 'covid19-public-dataset-aketari'
gcs_source_prefix = 'raw_txt'
dataset_id = bqCreateDataset('covid19')
table_id = bqCreateTable(dataset_id, 'ISMIR_cases')

lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
                                      prefix=gcs_source_prefix)


for blob in lst_blobs:
    doc_title = blob.name.split('/')[-1].split('.txt')[0]
    
    # download as string
    # it_raw_txt = gs://bucket_name/
    it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title))
    
    # eng_raw_txt = gs://covid19-aziz/text/[...]covid19-aziz_text_raw_txt_{doc_title}_en_translations.txt
    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
                                                                              bucket_name,
                                                                              doc_title)
    eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw)
    
    # curated_eng_txt = gs://covid19-aziz/text/curated_eng_txt/case1.txt
    curated_eng_blob = storage_client.get_bucket(bucket_name)\
                                    .get_blob('curated_eng_txt/{}.txt'.format(doc_title))
    
    # populate to BQ dataset
    exportItems2BQ(dataset_id, table_id, doc_title,
                   it_raw_blob, eng_raw_blob, curated_eng_blob)
Dataset covid19 created.
table ISMIR_cases created.
case1 was added to covid19 dataset, specifically in ISMIR_cases table.
case10 was added to covid19 dataset, specifically in ISMIR_cases table.
case11 was added to covid19 dataset, specifically in ISMIR_cases table.
case12 was added to covid19 dataset, specifically in ISMIR_cases table.
case13 was added to covid19 dataset, specifically in ISMIR_cases table.
case14 was added to covid19 dataset, specifically in ISMIR_cases table.
case15 was added to covid19 dataset, specifically in ISMIR_cases table.
case16 was added to covid19 dataset, specifically in ISMIR_cases table.
case17 was added to covid19 dataset, specifically in ISMIR_cases table.
case18 was added to covid19 dataset, specifically in ISMIR_cases table.
case19 was added to covid19 dataset, specifically in ISMIR_cases table.
case2 was added to covid19 dataset, specifically in ISMIR_cases table.
case20 was added to covid19 dataset, specifically in ISMIR_cases table.
case21 was added to covid19 dataset, specifically in ISMIR_cases table.
case22 was added to covid19 dataset, specifically in ISMIR_cases table.
case23 was added to covid19 dataset, specifically in ISMIR_cases table.
case24 was added to covid19 dataset, specifically in ISMIR_cases table.
case25 was added to covid19 dataset, specifically in ISMIR_cases table.
case26 was added to covid19 dataset, specifically in ISMIR_cases table.
case27 was added to covid19 dataset, specifically in ISMIR_cases table.
case28 was added to covid19 dataset, specifically in ISMIR_cases table.
case29 was added to covid19 dataset, specifically in ISMIR_cases table.
case3 was added to covid19 dataset, specifically in ISMIR_cases table.
case30 was added to covid19 dataset, specifically in ISMIR_cases table.
case31 was added to covid19 dataset, specifically in ISMIR_cases table.
case32 was added to covid19 dataset, specifically in ISMIR_cases table.
case33 was added to covid19 dataset, specifically in ISMIR_cases table.
case34 was added to covid19 dataset, specifically in ISMIR_cases table.
case35 was added to covid19 dataset, specifically in ISMIR_cases table.
case36 was added to covid19 dataset, specifically in ISMIR_cases table.
case37 was added to covid19 dataset, specifically in ISMIR_cases table.
case38 was added to covid19 dataset, specifically in ISMIR_cases table.
case39 was added to covid19 dataset, specifically in ISMIR_cases table.
case4 was added to covid19 dataset, specifically in ISMIR_cases table.
case40 was added to covid19 dataset, specifically in ISMIR_cases table.
case41 was added to covid19 dataset, specifically in ISMIR_cases table.
case42 was added to covid19 dataset, specifically in ISMIR_cases table.
case43 was added to covid19 dataset, specifically in ISMIR_cases table.
case44 was added to covid19 dataset, specifically in ISMIR_cases table.
case45 was added to covid19 dataset, specifically in ISMIR_cases table.
case46 was added to covid19 dataset, specifically in ISMIR_cases table.
case47 was added to covid19 dataset, specifically in ISMIR_cases table.
case48 was added to covid19 dataset, specifically in ISMIR_cases table.
case49 was added to covid19 dataset, specifically in ISMIR_cases table.
case5 was added to covid19 dataset, specifically in ISMIR_cases table.
case6 was added to covid19 dataset, specifically in ISMIR_cases table.
case7 was added to covid19 dataset, specifically in ISMIR_cases table.
case8 was added to covid19 dataset, specifically in ISMIR_cases table.
case9 was added to covid19 dataset, specifically in ISMIR_cases table.
In [45]:
def returnQueryResults(bq_client, project_id, dataset_id, table_id, case_id):
    """Look up a single case in BigQuery and print the matching row.

    Args:
        bq_client: BigQuery client.
        project_id: GCP project hosting the dataset.
        dataset_id: BigQuery dataset id.
        table_id: BigQuery table id.
        case_id: value of the `case` column to look up.
    """

    query = ('SELECT * FROM `{}.{}.{}` WHERE `case`="{}" LIMIT 1'.format(project_id, dataset_id, table_id, case_id))

    try:
        query_job = bq_client.query(query)
        is_exist = len(list(query_job.result())) >= 1
        logging.info('Query case id: {}'.format(case_id) if is_exist \
                         else "Case id: {} does NOT exist".format(case_id))
        print(list(query_job.result()))
    except Exception as e:
        logging.error("Error: %s", e)
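Formatting case_id directly into the SQL string works for these controlled inputs, but query parameters are safer; a sketch (query_case_parameterized is a hypothetical name):

def query_case_parameterized(bq_client, project_id, dataset_id, table_id, case_id):
    # Same lookup as above, with the case id bound as a query parameter
    query = 'SELECT * FROM `{}.{}.{}` WHERE `case` = @case_id LIMIT 1'.format(
        project_id, dataset_id, table_id)
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ScalarQueryParameter('case_id', 'STRING', case_id)])
    return list(bq_client.query(query, job_config=job_config).result())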
In [46]:
returnQueryResults(bq_client, project_id, 'covid19', 'ISMIR_cases', 'case1')
[Row(('case1', 'COVID-19: caso 1\na cura di:\nA. Borgheresi, A. Agostini, L. Ottaviani, C. Floridi, A. Giovagnoni\nDi p a r t i m en t o d i S c i e n ze Ra d i o l o g ic h e ‒ S c u o l a d i Speciali zzazione in Radi ologia\nUniversità Politecnica delle Marche ‒ Ancona (Italy)\nAzienda Ospedali Riuniti ‒ Torrette\nANCONA\nHRTC di un uomo di 80 anni con dispnea e febbre risultato positivo per COVID-19; esame eseguito a 5 giorni dallʼesordio.\n A\nImmagine A: ricostruzione con algoritmo Lung, immagine\nassiale. Si\napprezzano multiple opacità dja “vetro\nsmerigliato“00 cui si associa, in particolare ai lobi polmonari\ninferiori, ispessimento dei setti interlobulari con alispetto a\n"crazy paviſing o" cerchio nero). È anche presente addensamento\nlineare a distribuzione mantellare-subpleurica (freccia nera\npiena).\n B\nImmagine B: Ricostruzione coronale che mostra la distribuzione\nprevalentemente periferica delle opacità a "vetro smerigliato“\n(frecce nere vuote).\n ', 'COVID-19: case 1\nby:\nA. Borgheresi, A. Agostini, L. Ottaviani, C. Floridi, A. Giovagnoni\nDi p a r t i m en t o d i S c i e n ze Ra d i o l o g ic h e - S c u o l a d i Specialization in Radiology\nPolytechnic University of Marche - Ancona (Italy)\nAzienda Ospedali Riuniti - Torrette\nANCONA\nHRTC of an 80-year-old man with dyspnoea and fever tested positive for COVID-19; exam performed 5 days from the onset.\nTO\nImage A: reconstruction with Lung algorithm, image\naxial. Yes\nappreciate multiple opacities dja “glass\nfrosted “00 with which it is associated, in particular with the lung lobes\nlower, thickening of the interlobular septa with alispect a\n"crazy paviſing o" black circle). There is also thickening\nlinear with mantle-subpleural distribution (black arrow\nfull).\nB\nImage B: Coronal reconstruction showing the distribution\nmainly peripheral of "frosted glass" opacities\n(empty black arrows).\n', ' COVID 19 1 by A Borgheresi A Agostini L Ottaviani C Floridi A Giovagnoni Di S Ra S Specialization in Radiology Polytechnic University Marche Ancona Italy Azienda Ospedali Riuniti Torrette ANCONA HRTC an 80 year old man with dyspnoea and fever tested positive for COVID 19 exam performed 5 days from the onset TO Image A reconstruction with Lung algorithm image axial Yes appreciate multiple opacities dja glass frosted 00 with which it is associated in particular with the lung lobes lower thickening the interlobular septa with alispect crazy pavi ing black circle There is also thickening linear with mantle subpleural distribution black arrow full B Image B Coronal reconstruction showing the distribution mainly peripheral frosted glass opacities empty black arrows'), {'case': 0, 'it_raw_txt': 1, 'eng_raw_txt': 2, 'eng_txt': 3})]
Out[46]:
False

Upload to Datastore

In [3]:
# https://www.kdnuggets.com/2019/04/text-preprocessing-nlp-machine-learning.html
# Load model
# en_ner_bionlp13cg_md or en_core_sci_lg
#nlp = spacy.load("en_core_sci_lg")

nlp = en_core_sci_lg.load()
In [4]:
# Add pipe features to pipeline 
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp.add_pipe(linker)

# Add the abbreviation pipe to the spacy pipeline.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)
/opt/conda/lib/python3.7/site-packages/sklearn/base.py:318: UserWarning: Trying to unpickle estimator TfidfTransformer from version 0.20.3 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.
  UserWarning)
/opt/conda/lib/python3.7/site-packages/sklearn/base.py:318: UserWarning: Trying to unpickle estimator TfidfVectorizer from version 0.20.3 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.
  UserWarning)
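A quick smoke test of the pipeline on a made-up sentence; ent._.umls_ents holds (CUI, score) candidates and linker.umls.cui_to_entity maps a CUI to its UMLS record, as used in the extraction loop below:

# Hypothetical smoke test: print the best UMLS candidate for each detected entity
sample_doc = nlp('Chest CT shows ground glass opacities and interlobular septal thickening.')
for ent in sample_doc.ents:
    if ent._.umls_ents:                      # list of (CUI, score) candidates
        cui, score = ent._.umls_ents[0]      # highest-scoring candidate
        print(ent.text, cui, linker.umls.cui_to_entity[cui].canonical_name)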
In [5]:
def medicalEntityExtraction(doc):
    # convert text to vector
    display_text = displacy.render(doc,jupyter=True,style='ent')
    annotated_entities = set([(X.text, X.label_) for X in doc.ents])
    return display_text, annotated_entities
In [6]:
def addTask(client, doc_title, entities_dict):
    """Store the extracted entities for one case in Datastore under kind 'case'."""
    key = client.key('case', doc_title)
    task = datastore.Entity(key=key)
    task.update(entities_dict)
    client.put(task)
    # Read the entity back by key to confirm the write
    return client.get(key)
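Once a case has been written, its entity can be read back directly by key; a sketch assuming 'case1' has already been uploaded by the loop below:

# Hypothetical read-back of a stored case by its Datastore key
case_key = datastore_client.key('case', 'case1')
stored_case = datastore_client.get(case_key)
if stored_case is not None:
    print(sorted(stored_case.keys()))   # semantic categories stored for this case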
In [15]:
# list of blobs
bucket_name = 'covid19-public-dataset-aketari'
gcs_source_prefix = 'curated_eng_txt'
lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, 
                                      prefix=gcs_source_prefix)


for blob in lst_blobs:
    doc_title = blob.name.split('/')[-1].split('.txt')[0]
    
    # download as string
    eng_string = blob.download_as_string().decode('utf-8')
    
    # convert to vector
    doc = nlp(eng_string)
    
    # Extract medical entities and map each one to a UMLS semantic type id (TUI), e.g. T047
    pattern = r'T(\d{3})'

    UMLS_tuis_entity = {}
    entity_dict = {}


    for entity in doc.ents:
        entity_dict[entity] = ''
        # Resolve linked UMLS candidates; the last candidate's record is kept
        for umls_ent in entity._.umls_ents:
            entity_dict[entity] = linker.umls.cui_to_entity[umls_ent[0]]

        # Pull the semantic type id (TUI) out of the UMLS record, if one was found
        tui = re.search(pattern, str(entity_dict[entity]))
        if tui:
            UMLS_tuis_entity[str(entity)] = tui.group()
        else:
            UMLS_tuis_entity[str(entity)] = None
            
    # generate dataframes
    entities = list(UMLS_tuis_entity.keys())
    TUIs = list(UMLS_tuis_entity.values())
    df_entities = pd.DataFrame(data={'entity':entities,'TUIs':TUIs})
    df_reference_TUIs = pd.read_csv('./data/UMLS_tuis.csv')
    df_annotated_text_entities = pd.merge(df_entities,df_reference_TUIs,how='inner',on=['TUIs'])
    
    # Upload entities to Datastore, grouped by semantic category
    entities_dict = {}
    for idx in range(df_annotated_text_entities.shape[0]):
        category = df_annotated_text_entities.iloc[idx].values[2]
        med_entity = df_annotated_text_entities.iloc[idx].values[0]

        # Append the entity to its category, creating the list on first use
        entities_dict.setdefault(category, []).append(med_entity)

    # API call: store this case's entities in Datastore
    key = addTask(datastore_client, doc_title, entities_dict)
    print('The upload of {} entities is done.'.format(doc_title))
The upload of case1 entities is done.
The upload of case10 entities is done.
The upload of case11 entities is done.
The upload of case12 entities is done.
The upload of case13 entities is done.
The upload of case14 entities is done.
The upload of case15 entities is done.
The upload of case16 entities is done.
The upload of case17 entities is done.
The upload of case18 entities is done.
The upload of case19 entities is done.
The upload of case2 entities is done.
The upload of case20 entities is done.
The upload of case21 entities is done.
The upload of case22 entities is done.
The upload of case23 entities is done.
The upload of case24 entities is done.
The upload of case25 entities is done.
The upload of case26 entities is done.
The upload of case27 entities is done.
The upload of case28 entities is done.
The upload of case29 entities is done.
The upload of case3 entities is done.
The upload of case30 entities is done.
The upload of case31 entities is done.
The upload of case32 entities is done.
The upload of case33 entities is done.
The upload of case34 entities is done.
The upload of case35 entities is done.
The upload of case36 entities is done.
The upload of case37 entities is done.
The upload of case38 entities is done.
The upload of case39 entities is done.
The upload of case4 entities is done.
The upload of case40 entities is done.
The upload of case41 entities is done.
The upload of case42 entities is done.
The upload of case43 entities is done.
The upload of case44 entities is done.
The upload of case45 entities is done.
The upload of case46 entities is done.
The upload of case47 entities is done.
The upload of case48 entities is done.
The upload of case49 entities is done.
The upload of case5 entities is done.
The upload of case6 entities is done.
The upload of case7 entities is done.
The upload of case8 entities is done.
The upload of case9 entities is done.
In [8]:
df_annotated_text_entities.head()
Out[8]:
entity TUIs Categories
0 HRTC T059 Laboratory Procedure
1 year T079 Temporal Concept
2 man T047 Disease or Syndrome
3 dyspnoea T047 Disease or Syndrome
4 fever T109 Organic Chemical
In [21]:
def getCases(datastore_client, filter_dict, limit=10):
    query = datastore_client.query(kind='case')
    
    for key,values in filter_dict.items():
        for value in values:
            query.add_filter(key, '=', value)
    results = list(query.fetch(limit=limit))
    return results
In [25]:
filter_dict = {'Sign or Symptom':['onset symptoms', "chills"]}
getCases(datastore_client,filter_dict)
Out[25]:
[<Entity('case', 'case39') {'Quantitative Concept': ['results', 'multiple areas', 'density'], 'Therapeutic or Preventive Procedure': ['dyspnea', 'increased'], 'Disease or Syndrome': ['SaO', 'interstitial pneumonia'], 'Biomedical or Dental Material': ['swab'], 'Gene or Genome': ['PaCO'], 'Mental Process': ['transferred'], 'Cell Component': ['ter ter apia'], 'Entity': ['productive'], 'Body Location or Region': ['Chest CT', 'pharyngeal swab'], 'Spatial Concept': ['externally'], 'Intellectual Product': ['nausea', 'loss appetite', 'altered', 'L', 'CT', 'patent'], 'Population Group': ['woman'], 'Medical Device': ['blood sampling', 'L E erythrocyte sedimentation'], 'Qualitative Concept': ['associated in'], 'Tissue': ['lung parenchyma type frosted'], 'Amino Acid, Peptide, or Protein': ['Lactate Dehydrogenase'], 'Laboratory or Test Result': ['laboratory tests'], 'Idea or Concept': ['patient', 'Patient'], 'Laboratory Procedure': ['normal blood count', 'blood gas analysis'], 'Temporal Concept': ['year'], 'Body Space or Junction': ['ambient'], 'Research Activity': ['Diagnostic Imaging'], 'Virus': ['associated areas'], 'Geographic Area': ['Rome'], 'Functional Concept': ['otic'], 'Activity': ['lower lobes'], 'Sign or Symptom': ['chills', 'onset symptoms'], 'Body Part, Organ, or Organ Component': ['posterior regions', 'ectatic bronchi'], 'Finding': ['negative', 'days', 'pH', 'suspicion', 'positive'], 'Professional or Occupational Group': ['Radiologists'], 'Congenital Abnormality': ['absence']}>]