{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from google.cloud import vision\n", "from google.cloud import storage\n", "from google.protobuf import json_format\n", "from google.cloud import translate\n", "from google.cloud import bigquery\n", "from google.cloud import datastore\n", "import logging\n", "\n", "import re\n", "import time\n", "import pandas as pd\n", "import numpy as np\n", "\n", "#!sudo pip3 install scispacy\n", "import scispacy\n", "import spacy\n", "from spacy import displacy\n", "#https://github.com/explosion/spacy-models/releases/download/en_core_sci_sm-2.2.0/en_core_sci_sm-2.2.0.tar.gz\n", "#https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz\n", "import en_core_sci_sm, en_core_sci_lg #, en_ner_bionlp13cg_md\n", "from scispacy.umls_linking import UmlsEntityLinker\n", "from scispacy.abbreviation import AbbreviationDetector" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "References:\n", "- ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "project_id = \"pm-preparation\"\n", "location = \"us-central1\"\n", "storage_client = storage.Client()\n", "vision_client = vision.ImageAnnotatorClient()\n", "translate_client = translate.TranslationServiceClient()\n", "datastore_client = datastore.Client()\n", "bq_client = bigquery.Client()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def async_detect_document(vision_client, gcs_source_uri, gcs_destination_uri):\n", " \"\"\"OCR with PDF/TIFF as source files on GCS\"\"\"\n", " doc_title = gcs_source_uri.split('/')[-1].split('.pdf')[0]\n", " \n", " # Supported mime_types are: 'application/pdf' and 'image/tiff'\n", " mime_type = 'application/pdf'\n", "\n", " # How many pages should be grouped into each json output file.\n", " batch_size= 20\n", "\n", " feature = vision.types.Feature(\n", " type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)\n", "\n", " gcs_source = vision.types.GcsSource(uri=gcs_source_uri)\n", " input_config = vision.types.InputConfig(\n", " gcs_source=gcs_source, mime_type=mime_type)\n", "\n", " gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)\n", " output_config = vision.types.OutputConfig(\n", " gcs_destination=gcs_destination, batch_size=batch_size)\n", "\n", " async_request = vision.types.AsyncAnnotateFileRequest(\n", " features=[feature], input_config=input_config,\n", " output_config=output_config)\n", "\n", " operation = vision_client.async_batch_annotate_files(\n", " requests=[async_request])\n", "\n", " #print('Waiting for the operation to finish.')\n", " operation.result(timeout=180)\n", " print('Text extraction from document {} is completed.'.format(doc_title))\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Once the request has completed and the output has been\n", "# written to GCS, we can list all the output files.\n", "def read_json_result(json_gcs_path, doc_title):\n", " gcs_destination_prefix = 'json/' + '{}-'.format(doc_title)\n", " \n", " # List objects with the given prefix.\n", " blob_list = list(storage_client.list_blobs(bucket_or_name=bucket_name,\n", " prefix=gcs_destination_prefix))\n", " all_text = ''\n", " for blob in blob_list:\n", "\n", " json_string = blob.download_as_string()\n", " response = json_format.Parse(\n", " json_string, 
vision.types.AnnotateFileResponse())\n", "\n", " # The actual response for the first page of the input file.\n", " \n", " for response in response.responses: \n", " #first_page_response = response.responses[0]\n", " text_response = response.full_text_annotation.text\n", " all_text += text_response\n", " all_text += ' '\n", "\n", " print(\"Parsed json doc: {}\".format(doc_title))\n", " return all_text" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def upload_blob(txt_content, destination_blob_name):\n", " \"\"\"Uploads a file to the bucket.\"\"\"\n", " destination_blob_name = destination_blob_name.split('gs://{}/'.format(bucket_name))[-1]\n", " bucket = storage_client.bucket(bucket_name)\n", " blob = bucket.blob(destination_blob_name)\n", "\n", " blob.upload_from_string(txt_content)\n", "\n", " print(\"Text uploaded to {}\".format(destination_blob_name))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def batch_translate_text(\n", " input_uri=\"gs://YOUR_BUCKET_ID/path/to/your/file.txt\",\n", " output_uri=\"gs://YOUR_BUCKET_ID/path/to/save/results/\"):\n", " \"\"\"Translates a batch of texts on GCS and stores the result in a GCS location.\"\"\"\n", "\n", " # Supported file types: https://cloud.google.com/translate/docs/supported-formats\n", " gcs_source = {\"input_uri\": input_uri}\n", "\n", " input_configs_element = {\n", " \"gcs_source\": gcs_source,\n", " \"mime_type\": \"text/plain\" # Can be \"text/plain\" or \"text/html\".\n", " }\n", " gcs_destination = {\"output_uri_prefix\": output_uri}\n", " output_config = {\"gcs_destination\": gcs_destination}\n", " parent = translate_client.location_path(project_id, location)\n", "\n", " # Supported language codes: https://cloud.google.com/translate/docs/language\n", " operation = translate_client.batch_translate_text(\n", " parent=parent,\n", " source_language_code=\"it\",\n", " target_language_codes=[\"en\"], # Up to 10 language codes here.\n", " input_configs=[input_configs_element],\n", " output_config=output_config)\n", "\n", " response = operation.result(180)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def removePunctuation(string): \n", " \n", " # punctuation marks \n", " punctuations = '''!()-[]{};:'\"\\,<>./?@#$%^&*_~'''\n", " \n", " # traverse the given string and if any punctuation \n", " # marks occur replace it with null \n", " for x in string.lower(): \n", " if x in punctuations: \n", " string = string.replace(x, \"\") \n", " \n", " # Print string without punctuation \n", " return string" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**This step will take 1hr and 20 min approx**" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "customize_stop_words = [\n", " 'uoc', 'diagnostic', 'interventional', 'radiology', 'madonna', 'delle', 'grazie', 'hospital',\n", " 'Borgheresi', 'Agostini', 'Ottaviani', 'Floridi', 'Giovagnoni', 'di', 'specialization',\n", " 'Polytechnic', 'University', 'marche', 'ANCONA', 'Italy', 'Azienda', 'Ospedali', \n", " 'Riuniti', 'Yorrette', 'Matera', 'Michele', 'Nardella', 'Gerardo', 'Costanzo',\n", " 'Claudia', 'Lopez', 'st', 'a.', 'a', 'of', 's', 'cien', 'ze', 'diolog', 'ic', 'he',\n", " 'â', '€','s','b','case','Cuoladi','l','c','ra','bergamo','patelli','est','asst',\n", " 'dr','Dianluigi', 'Svizzero','i','riccardo','Alessandro','Spinazzola','angelo',\n", " 'maggiore', 'p' ,'r' ,'t', 'm', 'en', 't', 'o', 'd', 
'e', 'n', 'd', 'o', 'g', 'h', 'u'\n", "]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parsed json doc: case1\n", "Text uploaded to raw_txt/case1.txt\n", "Text uploaded to curated_eng_txt/case1.txt\n", "case1 processing is done.\n", "Parsed json doc: case10\n", "Text uploaded to raw_txt/case10.txt\n", "Text uploaded to curated_eng_txt/case10.txt\n", "case10 processing is done.\n", "Parsed json doc: case11\n", "Text uploaded to raw_txt/case11.txt\n", "Text uploaded to curated_eng_txt/case11.txt\n", "case11 processing is done.\n", "Parsed json doc: case12\n", "Text uploaded to raw_txt/case12.txt\n", "Text uploaded to curated_eng_txt/case12.txt\n", "case12 processing is done.\n", "Parsed json doc: case13\n", "Text uploaded to raw_txt/case13.txt\n", "Text uploaded to curated_eng_txt/case13.txt\n", "case13 processing is done.\n", "Parsed json doc: case14\n", "Text uploaded to raw_txt/case14.txt\n", "Text uploaded to curated_eng_txt/case14.txt\n", "case14 processing is done.\n", "Parsed json doc: case15\n", "Text uploaded to raw_txt/case15.txt\n", "Text uploaded to curated_eng_txt/case15.txt\n", "case15 processing is done.\n", "Parsed json doc: case16\n", "Text uploaded to raw_txt/case16.txt\n", "Text uploaded to curated_eng_txt/case16.txt\n", "case16 processing is done.\n", "Parsed json doc: case17\n", "Text uploaded to raw_txt/case17.txt\n", "Text uploaded to curated_eng_txt/case17.txt\n", "case17 processing is done.\n", "Parsed json doc: case18\n", "Text uploaded to raw_txt/case18.txt\n", "Text uploaded to curated_eng_txt/case18.txt\n", "case18 processing is done.\n", "Parsed json doc: case19\n", "Text uploaded to raw_txt/case19.txt\n", "Text uploaded to curated_eng_txt/case19.txt\n", "case19 processing is done.\n", "Parsed json doc: case2\n", "Text uploaded to raw_txt/case2.txt\n", "Text uploaded to curated_eng_txt/case2.txt\n", "case2 processing is done.\n", "Parsed json doc: case20\n", "Text uploaded to raw_txt/case20.txt\n", "Text uploaded to curated_eng_txt/case20.txt\n", "case20 processing is done.\n", "Parsed json doc: case21\n", "Text uploaded to raw_txt/case21.txt\n", "Text uploaded to curated_eng_txt/case21.txt\n", "case21 processing is done.\n", "Parsed json doc: case22\n", "Text uploaded to raw_txt/case22.txt\n", "Text uploaded to curated_eng_txt/case22.txt\n", "case22 processing is done.\n", "Parsed json doc: case23\n", "Text uploaded to raw_txt/case23.txt\n", "Text uploaded to curated_eng_txt/case23.txt\n", "case23 processing is done.\n", "Parsed json doc: case24\n", "Text uploaded to raw_txt/case24.txt\n", "Text uploaded to curated_eng_txt/case24.txt\n", "case24 processing is done.\n", "Parsed json doc: case25\n", "Text uploaded to raw_txt/case25.txt\n", "Text uploaded to curated_eng_txt/case25.txt\n", "case25 processing is done.\n", "Parsed json doc: case26\n", "Text uploaded to raw_txt/case26.txt\n", "Text uploaded to curated_eng_txt/case26.txt\n", "case26 processing is done.\n", "Parsed json doc: case27\n", "Text uploaded to raw_txt/case27.txt\n", "Text uploaded to curated_eng_txt/case27.txt\n", "case27 processing is done.\n", "Parsed json doc: case28\n", "Text uploaded to raw_txt/case28.txt\n", "Text uploaded to curated_eng_txt/case28.txt\n", "case28 processing is done.\n", "Parsed json doc: case29\n", "Text uploaded to raw_txt/case29.txt\n", "Text uploaded to curated_eng_txt/case29.txt\n", "case29 processing is done.\n", 
"Parsed json doc: case3\n", "Text uploaded to raw_txt/case3.txt\n", "Text uploaded to curated_eng_txt/case3.txt\n", "case3 processing is done.\n", "Parsed json doc: case30\n", "Text uploaded to raw_txt/case30.txt\n", "Text uploaded to curated_eng_txt/case30.txt\n", "case30 processing is done.\n", "Parsed json doc: case31\n", "Text uploaded to raw_txt/case31.txt\n", "Text uploaded to curated_eng_txt/case31.txt\n", "case31 processing is done.\n", "Parsed json doc: case32\n", "Text uploaded to raw_txt/case32.txt\n", "Text uploaded to curated_eng_txt/case32.txt\n", "case32 processing is done.\n", "Parsed json doc: case33\n", "Text uploaded to raw_txt/case33.txt\n", "Text uploaded to curated_eng_txt/case33.txt\n", "case33 processing is done.\n", "Parsed json doc: case34\n", "Text uploaded to raw_txt/case34.txt\n", "Text uploaded to curated_eng_txt/case34.txt\n", "case34 processing is done.\n", "Parsed json doc: case35\n", "Text uploaded to raw_txt/case35.txt\n", "Text uploaded to curated_eng_txt/case35.txt\n", "case35 processing is done.\n", "Parsed json doc: case36\n", "Text uploaded to raw_txt/case36.txt\n", "Text uploaded to curated_eng_txt/case36.txt\n", "case36 processing is done.\n", "Parsed json doc: case37\n", "Text uploaded to raw_txt/case37.txt\n", "Text uploaded to curated_eng_txt/case37.txt\n", "case37 processing is done.\n", "Parsed json doc: case38\n", "Text uploaded to raw_txt/case38.txt\n", "Text uploaded to curated_eng_txt/case38.txt\n", "case38 processing is done.\n", "Parsed json doc: case39\n", "Text uploaded to raw_txt/case39.txt\n", "Text uploaded to curated_eng_txt/case39.txt\n", "case39 processing is done.\n", "Parsed json doc: case4\n", "Text uploaded to raw_txt/case4.txt\n", "Text uploaded to curated_eng_txt/case4.txt\n", "case4 processing is done.\n", "Parsed json doc: case40\n", "Text uploaded to raw_txt/case40.txt\n", "Text uploaded to curated_eng_txt/case40.txt\n", "case40 processing is done.\n", "Parsed json doc: case41\n", "Text uploaded to raw_txt/case41.txt\n", "Text uploaded to curated_eng_txt/case41.txt\n", "case41 processing is done.\n", "Parsed json doc: case42\n", "Text uploaded to raw_txt/case42.txt\n", "Text uploaded to curated_eng_txt/case42.txt\n", "case42 processing is done.\n", "Parsed json doc: case43\n", "Text uploaded to raw_txt/case43.txt\n", "Text uploaded to curated_eng_txt/case43.txt\n", "case43 processing is done.\n", "Parsed json doc: case44\n", "Text uploaded to raw_txt/case44.txt\n", "Text uploaded to curated_eng_txt/case44.txt\n", "case44 processing is done.\n", "Parsed json doc: case45\n", "Text uploaded to raw_txt/case45.txt\n", "Text uploaded to curated_eng_txt/case45.txt\n", "case45 processing is done.\n", "Parsed json doc: case46\n", "Text uploaded to raw_txt/case46.txt\n", "Text uploaded to curated_eng_txt/case46.txt\n", "case46 processing is done.\n", "Parsed json doc: case47\n", "Text uploaded to raw_txt/case47.txt\n", "Text uploaded to curated_eng_txt/case47.txt\n", "case47 processing is done.\n", "Parsed json doc: case48\n", "Text uploaded to raw_txt/case48.txt\n", "Text uploaded to curated_eng_txt/case48.txt\n", "case48 processing is done.\n", "Parsed json doc: case49\n", "Text uploaded to raw_txt/case49.txt\n", "Text uploaded to curated_eng_txt/case49.txt\n", "case49 processing is done.\n", "Parsed json doc: case5\n", "Text uploaded to raw_txt/case5.txt\n", "Text uploaded to curated_eng_txt/case5.txt\n", "case5 processing is done.\n", "Parsed json doc: case6\n", "Text uploaded to raw_txt/case6.txt\n", "Text uploaded to 
curated_eng_txt/case6.txt\n", "case6 processing is done.\n", "Parsed json doc: case7\n", "Text uploaded to raw_txt/case7.txt\n", "Text uploaded to curated_eng_txt/case7.txt\n", "case7 processing is done.\n", "Parsed json doc: case8\n", "Text uploaded to raw_txt/case8.txt\n", "Text uploaded to curated_eng_txt/case8.txt\n", "case8 processing is done.\n", "Parsed json doc: case9\n", "Text uploaded to raw_txt/case9.txt\n", "Text uploaded to curated_eng_txt/case9.txt\n", "case9 processing is done.\n" ] } ], "source": [ "# Process documents\n", "\n", "bucket_name = 'covid19-public-dataset-aketari'\n", "gcs_source_prefix = 'pdf'\n", "lst_pdf_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, \n", " prefix='pdf')\n", "\n", "lst_json_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, \n", " prefix='json')\n", "\n", "overall_start_time = time.time()\n", "for blob in lst_pdf_blobs:\n", " doc_title = blob.name.split('/')[-1].split('.pdf')[0]\n", " #files_metadata[doc_title] = {}\n", "\n", " # Generate all paths\n", " gcs_source_path = 'gs://' + bucket_name + '/' + blob.name\n", " # Destination prefix for the Vision API JSON output; matches the 'json/<doc_title>-'\n", " # prefix that read_json_result lists further down.\n", " json_gcs_dest_path = 'gs://' + bucket_name + '/json/' + doc_title + '-'\n", "\n", " #start_time = time.time()\n", " # OCR pdf documents\n", " async_detect_document(vision_client, \n", " gcs_source_path,\n", " json_gcs_dest_path)\n", "print('OCR done.')\n", " \n", "for blob in lst_json_blobs:\n", " doc_title = blob.name.split('/')[-1].split('-')[0]\n", " \n", " json_gcs_dest_path = 'gs://' + bucket_name + '/{}'.format(blob.name)\n", " txt_gcs_dest_path = 'gs://' + bucket_name + '/raw_txt/' + doc_title + '.txt'\n", " eng_txt_gcs_dest_path = 'gs://' + bucket_name + '/eng_txt/{}/'.format(doc_title)\n", " processed_eng_gcs_dest_path = 'gs://' + bucket_name + '/curated_eng_txt/' + doc_title + '.txt'\n", " \n", " # Parse json\n", " all_text = read_json_result(json_gcs_dest_path, doc_title)\n", " #files_metadata[doc_title]['text'] = all_text\n", "\n", " # Upload raw text to GCS\n", " upload_blob(all_text, txt_gcs_dest_path)\n", "\n", " # Translate raw text to English\n", " batch_translate_text(input_uri = txt_gcs_dest_path,\n", " output_uri = eng_txt_gcs_dest_path)\n", " \n", " # Process English raw text\n", " blob_prefix = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,\n", " bucket_name,\n", " doc_title)\n", " \n", " eng_blob = storage_client.get_bucket(bucket_name).get_blob(blob_prefix)\n", " eng_raw_string = eng_blob.download_as_string().decode('utf-8')\n", " \n", " # lowercase\n", " #sample_text = eng_raw_string.lower()\n", "\n", " # Remove dates\n", " # 1 or 2 digit number followed by a slash followed by 1 or 2 digit number ...\n", " pattern_dates = '(\\d{1,2})/(\\d{1,2})/(\\d{4})'\n", " pattern_fig = 'Figure (\\d{1,2})' \n", " pattern_image = '^Image .$'\n", " replace = ''\n", "\n", " eng_raw_string = re.sub(pattern_dates, replace, eng_raw_string) \n", " eng_raw_string = re.sub(pattern_fig, replace, eng_raw_string)\n", " eng_raw_string = re.sub(pattern_image, replace, eng_raw_string) \n", " \n", " # remove punctuation and special characters\n", " eng_raw_string = re.sub('[^A-Za-z0-9]+', ' ', eng_raw_string)\n", "\n", " # Remove custom stop words\n", " tokens = [token for token in eng_raw_string.split() if token not in customize_stop_words]\n", "\n", " refined_doc = ''\n", " for word in tokens:\n", " refined_doc += ' {}'.format(word)\n", " \n", " # Upload curated English text to GCS\n", " upload_blob(refined_doc, processed_eng_gcs_dest_path)\n", "\n", " #print('refinement completed')\n", " print('{} processing is done.'.format(doc_title))\n" ] }, 
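{ "cell_type": "markdown", "metadata": {}, "source": [ "Optional sanity check (added sketch, not part of the original pipeline): before loading anything into BigQuery, the cell below reuses `storage_client` and `bucket_name` from above and reports any document under `pdf/` that has no matching file under `curated_eng_txt/`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Added sanity check (assumes the processing loop above has completed).\n", "# Compare document titles under pdf/ with the curated files under curated_eng_txt/.\n", "pdf_titles = {b.name.split('/')[-1].split('.pdf')[0]\n", "              for b in storage_client.list_blobs(bucket_or_name=bucket_name, prefix='pdf')\n", "              if b.name.endswith('.pdf')}\n", "curated_titles = {b.name.split('/')[-1].split('.txt')[0]\n", "                  for b in storage_client.list_blobs(bucket_or_name=bucket_name, prefix='curated_eng_txt')\n", "                  if b.name.endswith('.txt')}\n", "missing = sorted(pdf_titles - curated_titles)\n", "if missing:\n", "    print('Documents without curated text: {}'.format(missing))\n", "else:\n", "    print('All {} documents were processed.'.format(len(curated_titles)))" ] }, { "cell_type": "markdown", 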
"metadata": {}, "source": [ "## Upload to bigquery" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def bqCreateDataset(dataset_name):\n", " \n", " dataset_ref = bq_client.dataset(dataset_name)\n", "\n", " try:\n", " return bq_client.get_dataset(dataset_ref).dataset_id\n", " except:\n", " dataset = bigquery.Dataset(dataset_ref)\n", " dataset = bq_client.create_dataset(dataset)\n", " print('Dataset {} created.'.format(dataset.dataset_id))\n", " return dataset.dataset_id" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def bqCreateTable(dataset_id, \n", " table_name,):\n", " \"\"\"\n", " Create main table with all cases and the medical text.\n", " return:\n", " table_id\n", " \"\"\"\n", " dataset_ref = bq_client.dataset(dataset_id)\n", "\n", " # Prepares a reference to the table\n", " table_ref = dataset_ref.table(table_name)\n", "\n", " try:\n", " return bq_client.get_table(table_ref).table_id\n", " except:\n", " schema = [\n", " bigquery.SchemaField('case', 'STRING', mode='REQUIRED'),\n", " bigquery.SchemaField('it_raw_txt', 'STRING', mode='REQUIRED'),\n", " bigquery.SchemaField('eng_raw_txt', 'STRING', mode='REQUIRED'),\n", " bigquery.SchemaField('eng_txt', 'STRING', mode='REQUIRED', \n", " description='Output of preprocessing pipeline.')]\n", " table = bigquery.Table(table_ref, schema=schema)\n", " table = bq_client.create_table(table)\n", " print('table {} created.'.format(table.table_id))\n", " return table.table_id" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def exportItems2BQ(dataset_id, table_id, case,\n", " it_raw_blob, eng_raw_blob, curated_eng_blob):\n", "\n", " # Prepares a reference to the dataset\n", " dataset_ref = bq_client.dataset(dataset_id)\n", "\n", " table_ref = dataset_ref.table(table_id)\n", " table = bq_client.get_table(table_ref) # API call\n", " \n", " # Download text from GCS\n", " it_raw_txt_string = it_raw_blob.download_as_string().decode('utf-8')\n", " eng_raw_txt_string = eng_raw_blob.download_as_string().decode('utf-8')\n", " curated_eng_string = curated_eng_blob.download_as_string().decode('utf-8')\n", " \n", " rows_to_insert = [{'case': case,\n", " 'it_raw_txt': it_raw_txt_string,\n", " 'eng_raw_txt': eng_raw_txt_string,\n", " 'eng_txt': curated_eng_string\n", " }]\n", " errors = bq_client.insert_rows(table, rows_to_insert) # API request\n", " assert errors == []\n", " print('{} was added to {} dataset, specifically in {} table.'.format(case,\n", " dataset_id,\n", " table_id))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset covid19 created.\n", "table ISMIR_cases created.\n", "case1 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case10 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case11 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case12 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case13 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case14 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case15 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case16 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case17 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case18 
was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case19 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case2 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case20 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case21 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case22 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case23 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case24 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case25 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case26 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case27 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case28 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case29 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case3 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case30 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case31 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case32 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case33 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case34 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case35 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case36 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case37 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case38 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case39 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case4 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case40 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case41 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case42 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case43 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case44 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case45 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case46 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case47 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case48 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case49 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case5 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case6 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case7 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case8 was added to covid19 dataset, specifically in ISMIR_cases table.\n", "case9 was added to covid19 dataset, specifically in ISMIR_cases table.\n" ] } ], "source": [ "bucket_name = 'covid19-public-dataset-aketari'\n", "gcs_source_prefix = 'raw_txt'\n", "dataset_id = bqCreateDataset('covid19')\n", "table_id = bqCreateTable(dataset_id, 'ISMIR_cases')\n", "\n", "lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, \n", " prefix=gcs_source_prefix)\n", "\n", "\n", "for blob in lst_blobs:\n", " doc_title = blob.name.split('/')[-1].split('.txt')[0]\n", " \n", " # download as string\n", " # it_raw_txt = gs://bucket_name/\n", 
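" # Added note: the three text versions fetched below (Italian OCR text, machine-translated\n", " # English, and curated English) are written as a single BigQuery row per case by exportItems2BQ.\n", " # get_blob() returns None when an object is missing, so a failed translation step would\n", " # surface as an AttributeError inside exportItems2BQ.\n", 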
" it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title))\n", " \n", " # eng_raw_txt = gs://covid19-aziz/text/[...]covid19-aziz_text_raw_txt_{doc_title}_en_translations.txt\n", " path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,\n", " bucket_name,\n", " doc_title)\n", " eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw)\n", " \n", " # curated_eng_txt = gs://covid19-aziz/text/curated_eng_txt/case1.txt\n", " curated_eng_blob = storage_client.get_bucket(bucket_name)\\\n", " .get_blob('curated_eng_txt/{}.txt'.format(doc_title))\n", " \n", " # populate to BQ dataset\n", " exportItems2BQ(dataset_id, table_id, doc_title,\n", " it_raw_blob, eng_raw_blob, curated_eng_blob)\n" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "def returnQueryResults(bq_client, project_id, dataset_id, table_id, case_id):\n", " \"\"\"\n", "\n", " Args:\n", " bq_client:\n", " project_id:\n", " dataset_id:\n", " table_id:\n", " case_id:\n", "\n", " Returns:\n", "\n", " \"\"\"\n", "\n", " query = ('SELECT * FROM `{}.{}.{}` WHERE `case`=\"{}\" LIMIT 1'.format(project_id, dataset_id, table_id, case_id))\n", "\n", " try:\n", " query_job = bq_client.query(query)\n", " is_exist = len(list(query_job.result())) >= 1\n", " logging.info('Query case id: {}'.format(case_id) if is_exist \\\n", " else \"Case id: {} does NOT exist\".format(case_id))\n", " print (list(query_job.result()))\n", " except Exception as e:\n", " logging.error(\"Error\", e)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Row(('case1', 'COVID-19: caso 1\\na cura di:\\nA. Borgheresi, A. Agostini, L. Ottaviani, C. Floridi, A. Giovagnoni\\nDi p a r t i m en t o d i S c i e n ze Ra d i o l o g ic h e ‒ S c u o l a d i Speciali zzazione in Radi ologia\\nUniversità Politecnica delle Marche ‒ Ancona (Italy)\\nAzienda Ospedali Riuniti ‒ Torrette\\nANCONA\\nHRTC di un uomo di 80 anni con dispnea e febbre risultato positivo per COVID-19; esame eseguito a 5 giorni dallʼesordio.\\n A\\nImmagine A: ricostruzione con algoritmo Lung, immagine\\nassiale. Si\\napprezzano multiple opacità dja “vetro\\nsmerigliato“00 cui si associa, in particolare ai lobi polmonari\\ninferiori, ispessimento dei setti interlobulari con alispetto a\\n\"crazy paviſing o\" cerchio nero). È anche presente addensamento\\nlineare a distribuzione mantellare-subpleurica (freccia nera\\npiena).\\n B\\nImmagine B: Ricostruzione coronale che mostra la distribuzione\\nprevalentemente periferica delle opacità a \"vetro smerigliato“\\n(frecce nere vuote).\\n ', 'COVID-19: case 1\\nby:\\nA. Borgheresi, A. Agostini, L. Ottaviani, C. Floridi, A. Giovagnoni\\nDi p a r t i m en t o d i S c i e n ze Ra d i o l o g ic h e - S c u o l a d i Specialization in Radiology\\nPolytechnic University of Marche - Ancona (Italy)\\nAzienda Ospedali Riuniti - Torrette\\nANCONA\\nHRTC of an 80-year-old man with dyspnoea and fever tested positive for COVID-19; exam performed 5 days from the onset.\\nTO\\nImage A: reconstruction with Lung algorithm, image\\naxial. Yes\\nappreciate multiple opacities dja “glass\\nfrosted “00 with which it is associated, in particular with the lung lobes\\nlower, thickening of the interlobular septa with alispect a\\n\"crazy paviſing o\" black circle). 
There is also thickening\\nlinear with mantle-subpleural distribution (black arrow\\nfull).\\nB\\nImage B: Coronal reconstruction showing the distribution\\nmainly peripheral of \"frosted glass\" opacities\\n(empty black arrows).\\n', ' COVID 19 1 by A Borgheresi A Agostini L Ottaviani C Floridi A Giovagnoni Di S Ra S Specialization in Radiology Polytechnic University Marche Ancona Italy Azienda Ospedali Riuniti Torrette ANCONA HRTC an 80 year old man with dyspnoea and fever tested positive for COVID 19 exam performed 5 days from the onset TO Image A reconstruction with Lung algorithm image axial Yes appreciate multiple opacities dja glass frosted 00 with which it is associated in particular with the lung lobes lower thickening the interlobular septa with alispect crazy pavi ing black circle There is also thickening linear with mantle subpleural distribution black arrow full B Image B Coronal reconstruction showing the distribution mainly peripheral frosted glass opacities empty black arrows'), {'case': 0, 'it_raw_txt': 1, 'eng_raw_txt': 2, 'eng_txt': 3})]\n" ] }, { "data": { "text/plain": [ "False" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returnQueryResults(bq_client, project_id, 'covid19', 'ISMIR_cases', 'case1')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Upload to Datastore" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# https://www.kdnuggets.com/2019/04/text-preprocessing-nlp-machine-learning.html\n", "# Load model\n", "# en_ner_bionlp13cg_md or en_core_sci_lg\n", "#nlp = spacy.load(\"en_core_sci_lg\")\n", "\n", "nlp = en_core_sci_lg.load()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.7/site-packages/sklearn/base.py:318: UserWarning: Trying to unpickle estimator TfidfTransformer from version 0.20.3 when using version 0.22.2.post1. This might lead to breaking code or invalid results. Use at your own risk.\n", " UserWarning)\n", "/opt/conda/lib/python3.7/site-packages/sklearn/base.py:318: UserWarning: Trying to unpickle estimator TfidfVectorizer from version 0.20.3 when using version 0.22.2.post1. This might lead to breaking code or invalid results. 
Use at your own risk.\n", " UserWarning)\n" ] } ], "source": [ "# Add pipe features to pipeline \n", "linker = UmlsEntityLinker(resolve_abbreviations=True)\n", "nlp.add_pipe(linker)\n", "\n", "# Add the abbreviation pipe to the spacy pipeline.\n", "abbreviation_pipe = AbbreviationDetector(nlp)\n", "nlp.add_pipe(abbreviation_pipe)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def medicalEntityExtraction(doc):\n", " # convert text to vector\n", " display_text = displacy.render(doc,jupyter=True,style='ent')\n", " annotated_entities = set([(X.text, X.label_) for X in doc.ents])\n", " return display_text, annotated_entities" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def addTask(client, entities_dict):\n", " key = client.key('case', doc_title)\n", " task = datastore.Entity(key=key)\n", " task.update(\n", " entities_dict)\n", " client.put(task)\n", " # Then get by key for this entity\n", " return client.get(key)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The upload of case1 entities is done.\n", "The upload of case10 entities is done.\n", "The upload of case11 entities is done.\n", "The upload of case12 entities is done.\n", "The upload of case13 entities is done.\n", "The upload of case14 entities is done.\n", "The upload of case15 entities is done.\n", "The upload of case16 entities is done.\n", "The upload of case17 entities is done.\n", "The upload of case18 entities is done.\n", "The upload of case19 entities is done.\n", "The upload of case2 entities is done.\n", "The upload of case20 entities is done.\n", "The upload of case21 entities is done.\n", "The upload of case22 entities is done.\n", "The upload of case23 entities is done.\n", "The upload of case24 entities is done.\n", "The upload of case25 entities is done.\n", "The upload of case26 entities is done.\n", "The upload of case27 entities is done.\n", "The upload of case28 entities is done.\n", "The upload of case29 entities is done.\n", "The upload of case3 entities is done.\n", "The upload of case30 entities is done.\n", "The upload of case31 entities is done.\n", "The upload of case32 entities is done.\n", "The upload of case33 entities is done.\n", "The upload of case34 entities is done.\n", "The upload of case35 entities is done.\n", "The upload of case36 entities is done.\n", "The upload of case37 entities is done.\n", "The upload of case38 entities is done.\n", "The upload of case39 entities is done.\n", "The upload of case4 entities is done.\n", "The upload of case40 entities is done.\n", "The upload of case41 entities is done.\n", "The upload of case42 entities is done.\n", "The upload of case43 entities is done.\n", "The upload of case44 entities is done.\n", "The upload of case45 entities is done.\n", "The upload of case46 entities is done.\n", "The upload of case47 entities is done.\n", "The upload of case48 entities is done.\n", "The upload of case49 entities is done.\n", "The upload of case5 entities is done.\n", "The upload of case6 entities is done.\n", "The upload of case7 entities is done.\n", "The upload of case8 entities is done.\n", "The upload of case9 entities is done.\n" ] } ], "source": [ "# list of blobs\n", "bucket_name = 'covid19-public-dataset-aketari'\n", "gcs_source_prefix = 'curated_eng_txt'\n", "lst_blobs = 
storage_client.list_blobs(bucket_or_name=bucket_name, \n", " prefix=gcs_source_prefix)\n", "\n", "\n", "for blob in lst_blobs:\n", " doc_title = blob.name.split('/')[-1].split('.txt')[0]\n", " \n", " # download as string\n", " eng_string = blob.download_as_string().decode('utf-8')\n", " \n", " # convert to vector\n", " doc = nlp(eng_string)\n", " \n", " # Extract medical entities\n", " pattern = 'T(\\d{3})'\n", "\n", " UMLS_tuis_entity = {}\n", " entity_dict = {}\n", "\n", "\n", " for idx in range(len(doc.ents)):\n", " entity = doc.ents[idx]\n", " entity_dict[entity] = ''\n", " for umls_ent in entity._.umls_ents:\n", " entity_dict[entity] = linker.umls.cui_to_entity[umls_ent[0]]\n", "\n", " tui = re.search(pattern, str(entity_dict[entity]))\n", " if tui:\n", " UMLS_tuis_entity[str(entity)] = tui.group()\n", " else:\n", " UMLS_tuis_entity[str(entity)] = None\n", " \n", " # generate dataframes\n", " entities = list(UMLS_tuis_entity.keys())\n", " TUIs = list(UMLS_tuis_entity.values())\n", " df_entities = pd.DataFrame(data={'entity':entities,'TUIs':TUIs})\n", " df_reference_TUIs = pd.read_csv('UMLS_tuis.csv')\n", " df_annotated_text_entities = pd.merge(df_entities,df_reference_TUIs,how='inner',on=['TUIs'])\n", " \n", " # upload entities to datastore\n", " entities_dict = {}\n", " for idx in range(df_annotated_text_entities.shape[0]):\n", " \n", " category = df_annotated_text_entities.iloc[idx].values[2]\n", " \n", " #TUI = df_annotated_text_entities.iloc[idx].values[1]\n", " #entities_dict[category].append(TUI)\n", " \n", " med_entity = df_annotated_text_entities.iloc[idx].values[0]\n", " \n", " \n", " # Append to list of entities if the key,value pair already exist\n", " try:\n", " entities_dict[category].append(med_entity)\n", " \n", " except:\n", " entities_dict[category] = []\n", " entities_dict[category].append(med_entity)\n", " \n", " \n", " # API call\n", " key = addTask(datastore_client, entities_dict)\n", " print('The upload of {} entities is done.'.format(doc_title))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | entity | \n", "TUIs | \n", "Categories | \n", "
---|---|---|---|
0 | \n", "HRTC | \n", "T059 | \n", "Laboratory Procedure | \n", "
1 | \n", "year | \n", "T079 | \n", "Temporal Concept | \n", "
2 | \n", "man | \n", "T047 | \n", "Disease or Syndrome | \n", "
3 | \n", "dyspnoea | \n", "T047 | \n", "Disease or Syndrome | \n", "
4 | \n", "fever | \n", "T109 | \n", "Organic Chemical | \n", "