瀏覽代碼

updated notebooks

Aziz Ketari 4 年之前
父節點
當前提交
27bee22cde
共有 4 個文件被更改,包括 49 次插入65 次删除
  1. 二進制
      .DS_Store
  2. 二進制
      notebooks/.DS_Store
  3. 26 37
      notebooks/end2end_pipeline.html
  4. 23 28
      notebooks/end2end_pipeline.ipynb

二進制
.DS_Store


二進制
notebooks/.DS_Store


+ 26 - 37
notebooks/end2end_draft_flow.html → notebooks/end2end_pipeline.html

@@ -2,7 +2,7 @@
 <html>
 <head><meta charset="utf-8" />
 
-<title>ETL</title>
+<title>end2end_pipeline</title>
 
 <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script>
@@ -13110,10 +13110,8 @@ div#notebook {
 <div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
 </div><div class="inner_cell">
 <div class="text_cell_render border-box-sizing rendered_html">
-<p>References:</p>
-<ul>
-<li>ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing</li>
-</ul>
+<p><strong>Note:</strong></p>
+<p>This notebook was run on an AI Platform Notebook instance. If you are running this notebook on your local machine, you need to provide the service account credentials in order to authenticate when making the API calls.</p>
 
 </div>
 </div>
@@ -13123,8 +13121,14 @@ div#notebook {
 <div class="prompt input_prompt">In&nbsp;[2]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">project_id</span> <span class="o">=</span> <span class="s2">&quot;pm-preparation&quot;</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># TODO: replace with your settings</span>
+<span class="n">project_id</span> <span class="o">=</span> <span class="s2">&quot;your_project_id&quot;</span>
 <span class="n">location</span> <span class="o">=</span> <span class="s2">&quot;us-central1&quot;</span>
+<span class="n">bucket_name</span> <span class="o">=</span> <span class="s1">&#39;bucket_name&#39;</span>
+<span class="n">bq_dataset_name</span> <span class="o">=</span> <span class="s1">&#39;name_of_your_choice&#39;</span>
+<span class="n">bq_table_name</span> <span class="o">=</span> <span class="s1">&#39;name_of_your_choice_2&#39;</span>
+
+
 <span class="n">storage_client</span> <span class="o">=</span> <span class="n">storage</span><span class="o">.</span><span class="n">Client</span><span class="p">()</span>
 <span class="n">vision_client</span> <span class="o">=</span> <span class="n">vision</span><span class="o">.</span><span class="n">ImageAnnotatorClient</span><span class="p">()</span>
 <span class="n">translate_client</span> <span class="o">=</span> <span class="n">translate</span><span class="o">.</span><span class="n">TranslationServiceClient</span><span class="p">()</span>
@@ -13299,14 +13303,6 @@ div#notebook {
 </div>
 </div>
 
-</div>
-<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
-</div><div class="inner_cell">
-<div class="text_cell_render border-box-sizing rendered_html">
-<p><strong>This step will take 1hr and 20 min approx</strong></p>
-
-</div>
-</div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
@@ -13336,8 +13332,6 @@ div#notebook {
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Process documents</span>
-
-<span class="n">bucket_name</span> <span class="o">=</span> <span class="s1">&#39;covid19-public-dataset-aketari&#39;</span>
 <span class="n">gcs_source_prefix</span> <span class="o">=</span> <span class="s1">&#39;pdf&#39;</span>
 <span class="n">lst_pdf_blobs</span> <span class="o">=</span> <span class="n">storage_client</span><span class="o">.</span><span class="n">list_blobs</span><span class="p">(</span><span class="n">bucket_or_name</span><span class="o">=</span><span class="n">bucket_name</span><span class="p">,</span> 
                                       <span class="n">prefix</span><span class="o">=</span><span class="s1">&#39;pdf&#39;</span><span class="p">)</span>
@@ -13744,10 +13738,9 @@ case9 processing is done.
 <div class="prompt input_prompt">In&nbsp;[19]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="n">bucket_name</span> <span class="o">=</span> <span class="s1">&#39;covid19-public-dataset-aketari&#39;</span>
-<span class="n">gcs_source_prefix</span> <span class="o">=</span> <span class="s1">&#39;raw_txt&#39;</span>
-<span class="n">dataset_id</span> <span class="o">=</span> <span class="n">bqCreateDataset</span><span class="p">(</span><span class="s1">&#39;covid19&#39;</span><span class="p">)</span>
-<span class="n">table_id</span> <span class="o">=</span> <span class="n">bqCreateTable</span><span class="p">(</span><span class="n">dataset_id</span><span class="p">,</span> <span class="s1">&#39;ISMIR_cases&#39;</span><span class="p">)</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="n">gcs_source_prefix</span> <span class="o">=</span> <span class="s1">&#39;raw_txt&#39;</span>
+<span class="n">dataset_id</span> <span class="o">=</span> <span class="n">bqCreateDataset</span><span class="p">(</span><span class="n">bq_dataset_name</span><span class="p">)</span>
+<span class="n">table_id</span> <span class="o">=</span> <span class="n">bqCreateTable</span><span class="p">(</span><span class="n">dataset_id</span><span class="p">,</span> <span class="n">bq_table_name</span><span class="p">)</span>
 
 <span class="n">lst_blobs</span> <span class="o">=</span> <span class="n">storage_client</span><span class="o">.</span><span class="n">list_blobs</span><span class="p">(</span><span class="n">bucket_or_name</span><span class="o">=</span><span class="n">bucket_name</span><span class="p">,</span> 
                                       <span class="n">prefix</span><span class="o">=</span><span class="n">gcs_source_prefix</span><span class="p">)</span>
@@ -13940,10 +13933,8 @@ case9 was added to covid19 dataset, specifically in ISMIR_cases table.
 <div class="prompt input_prompt">In&nbsp;[3]:</div>
 <div class="inner_cell">
     <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># https://www.kdnuggets.com/2019/04/text-preprocessing-nlp-machine-learning.html</span>
-<span class="c1"># Load model</span>
+<div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># Load model</span>
 <span class="c1"># en_ner_bionlp13cg_md or en_core_sci_lg</span>
-<span class="c1">#nlp = spacy.load(&quot;en_core_sci_lg&quot;)</span>
 
 <span class="n">nlp</span> <span class="o">=</span> <span class="n">en_core_sci_lg</span><span class="o">.</span><span class="n">load</span><span class="p">()</span>
 </pre></div>
@@ -14029,19 +14020,6 @@ case9 was added to covid19 dataset, specifically in ISMIR_cases table.
 </div>
 </div>
 
-</div>
-<div class="cell border-box-sizing code_cell rendered">
-<div class="input">
-<div class="prompt input_prompt">In&nbsp;[27]:</div>
-<div class="inner_cell">
-    <div class="input_area">
-<div class=" highlight hl-ipython3"><pre><span></span> 
-</pre></div>
-
-    </div>
-</div>
-</div>
-
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">
@@ -14049,7 +14027,6 @@ case9 was added to covid19 dataset, specifically in ISMIR_cases table.
 <div class="inner_cell">
     <div class="input_area">
 <div class=" highlight hl-ipython3"><pre><span></span><span class="c1"># list of blobs</span>
-<span class="n">bucket_name</span> <span class="o">=</span> <span class="s1">&#39;covid19-public-dataset-aketari&#39;</span>
 <span class="n">gcs_source_prefix</span> <span class="o">=</span> <span class="s1">&#39;curated_eng_txt&#39;</span>
 <span class="n">lst_blobs</span> <span class="o">=</span> <span class="n">storage_client</span><span class="o">.</span><span class="n">list_blobs</span><span class="p">(</span><span class="n">bucket_or_name</span><span class="o">=</span><span class="n">bucket_name</span><span class="p">,</span> 
                                       <span class="n">prefix</span><span class="o">=</span><span class="n">gcs_source_prefix</span><span class="p">)</span>
@@ -14328,6 +14305,18 @@ The upload of case9 entities is done.
 </div>
 </div>
 
+</div>
+<div class="cell border-box-sizing text_cell rendered"><div class="prompt input_prompt">
+</div><div class="inner_cell">
+<div class="text_cell_render border-box-sizing rendered_html">
+<hr>
+<p>References:</p>
+<ul>
+<li><a href="https://arxiv.org/abs/1902.07669">ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing</a></li>
+</ul>
+
+</div>
+</div>
 </div>
 <div class="cell border-box-sizing code_cell rendered">
 <div class="input">

+ 23 - 28
notebooks/end2end_draft_flow.ipynb → notebooks/end2end_pipeline.ipynb

@@ -21,11 +21,10 @@
     "\n",
     "#!sudo pip3 install scispacy\n",
     "import scispacy\n",
-    "import spacy\n",
     "from spacy import displacy\n",
     "#https://github.com/explosion/spacy-models/releases/download/en_core_sci_sm-2.2.0/en_core_sci_sm-2.2.0.tar.gz\n",
     "#https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz\n",
-    "import en_core_sci_sm, en_core_sci_lg #, en_ner_bionlp13cg_md\n",
+    "import en_core_sci_lg # en_ner_bionlp13cg_md, en_core_sci_sm\n",
     "from scispacy.umls_linking import UmlsEntityLinker\n",
     "from scispacy.abbreviation import AbbreviationDetector"
    ]
@@ -34,8 +33,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "References:\n",
-    "- ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing"
+    "**Note:** \n",
+    "\n",
+    "This notebook was run on an AI Platform Notebook instance. If you are running this notebook on your local machine, you need to provide the service account credentials in order to authenticate when making the API calls."
    ]
   },
   {
@@ -44,8 +44,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "project_id = \"pm-preparation\"\n",
+    "# TODO: replace with your settings\n",
+    "project_id = \"your_project_id\"\n",
     "location = \"us-central1\"\n",
+    "bucket_name = 'bucket_name'\n",
+    "bq_dataset_name = 'name_of_your_choice'\n",
+    "bq_table_name = 'name_of_your_choice_2'\n",
+    "\n",
+    "\n",
     "storage_client = storage.Client()\n",
     "vision_client = vision.ImageAnnotatorClient()\n",
     "translate_client = translate.TranslationServiceClient()\n",
@@ -196,13 +202,6 @@
     "    return string"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**This step will take 1hr and 20 min approx**"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 8,
@@ -436,8 +435,6 @@
    ],
    "source": [
     "# Process documents\n",
-    "\n",
-    "bucket_name = 'covid19-public-dataset-aketari'\n",
     "gcs_source_prefix = 'pdf'\n",
     "lst_pdf_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, \n",
     "                                      prefix='pdf')\n",
@@ -678,10 +675,9 @@
     }
    ],
    "source": [
-    "bucket_name = 'covid19-public-dataset-aketari'\n",
     "gcs_source_prefix = 'raw_txt'\n",
-    "dataset_id = bqCreateDataset('covid19')\n",
-    "table_id = bqCreateTable(dataset_id, 'ISMIR_cases')\n",
+    "dataset_id = bqCreateDataset(bq_dataset_name)\n",
+    "table_id = bqCreateTable(dataset_id, bq_table_name)\n",
     "\n",
     "lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, \n",
     "                                      prefix=gcs_source_prefix)\n",
@@ -781,10 +777,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# https://www.kdnuggets.com/2019/04/text-preprocessing-nlp-machine-learning.html\n",
     "# Load model\n",
     "# en_ner_bionlp13cg_md or en_core_sci_lg\n",
-    "#nlp = spacy.load(\"en_core_sci_lg\")\n",
     "\n",
     "nlp = en_core_sci_lg.load()"
    ]
@@ -844,13 +838,6 @@
     "    return client.get(key)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": 15,
@@ -914,7 +901,6 @@
    ],
    "source": [
     "# list of blobs\n",
-    "bucket_name = 'covid19-public-dataset-aketari'\n",
     "gcs_source_prefix = 'curated_eng_txt'\n",
     "lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name, \n",
     "                                      prefix=gcs_source_prefix)\n",
@@ -952,7 +938,7 @@
     "    entities = list(UMLS_tuis_entity.keys())\n",
     "    TUIs = list(UMLS_tuis_entity.values())\n",
     "    df_entities = pd.DataFrame(data={'entity':entities,'TUIs':TUIs})\n",
-    "    df_reference_TUIs = pd.read_csv('UMLS_tuis.csv')\n",
+    "    df_reference_TUIs = pd.read_csv('./data/UMLS_tuis.csv')\n",
     "    df_annotated_text_entities = pd.merge(df_entities,df_reference_TUIs,how='inner',on=['TUIs'])\n",
     "    \n",
     "    # upload entities to datastore\n",
@@ -1102,6 +1088,15 @@
     "getCases(datastore_client,filter_dict)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "References:\n",
+    "- [ScispaCy: Fast and Robust Models for Biomedical Natural Language Processing](https://arxiv.org/abs/1902.07669)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,