@@ -2,9 +2,25 @@ from google.cloud import storage, bigquery, datastore
 from google.oauth2 import service_account
 from utils.bq_fcn import bqCreateDataset, bqCreateTable, exportItems2BQ
 from utils.ner_fcn import loadModel, addTask, extractMedEntities
-import en_core_sci_lg
 
 import logging
+logging.getLogger().setLevel(logging.INFO)
+
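+# Import each scispaCy model independently so that a single missing
+# model only logs a warning instead of halting the whole pipeline.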
+try:
+    import en_core_sci_sm
+except ImportError:
+    logging.warning("en_core_sci_sm not found. Make sure the model was downloaded and installed.")
+
+try:
+    import en_core_sci_lg
+except ImportError:
+    logging.warning("en_core_sci_lg not found. Make sure the model was downloaded and installed.")
+try:
+    import en_ner_bionlp13cg_md
+except ImportError:
+    logging.warning("en_ner_bionlp13cg_md not found. Make sure the model was downloaded and installed.")
+
+
 import time
 import os
 import pandas as pd
@@ -22,6 +38,8 @@ storage_client = storage.Client(credentials=credentials)
 datastore_client = datastore.Client(credentials=credentials)
 
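+# A single BigQuery client, shared by the utils.bq_fcn helpers below.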
+bq_client = bigquery.Client(credentials=credentials)
+
 gcs_source_prefix = 'raw_txt'
 lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                       prefix=gcs_source_prefix)
@@ -29,13 +47,13 @@ lst_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
 start_time = time.time()
 
 try:
-    dataset_id = bqCreateDataset(dataset_name)
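+    # Create the dataset if needed, or retrieve it, using the shared BigQuery client.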
+    dataset_id = bqCreateDataset(bq_client, dataset_name)
     logging.info("The following dataset {} was successfully created/retrieved.".format(dataset_name))
 except Exception as e:
     logging.error("An error occurred.", e)
 
 try:
-    table_id = bqCreateTable(dataset_id, table_name)
+    table_id = bqCreateTable(bq_client, dataset_id, table_name)
     logging.info("The following table {} was successfully created/retrieved.".format(table_name))
 except Exception as e:
     logging.error("An error occurred.", e)
@@ -47,9 +65,7 @@ for blob in lst_blobs:
     it_raw_blob = storage_client.get_bucket(bucket_name).get_blob('raw_txt/{}.txt'.format(doc_title))
 
     # set the GCS path
-    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title,
-                                                                              bucket_name,
-                                                                              doc_title)
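+    # i.e. eng_txt/<doc_title>/<bucket_name>_raw_txt_<doc_title>_en_translations.txt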
+    path_blob_eng_raw = 'eng_txt/{}/{}_raw_txt_{}_en_translations.txt'.format(doc_title, bucket_name, doc_title)
     eng_raw_blob = storage_client.get_bucket(bucket_name).get_blob(path_blob_eng_raw)
 
     # Upload blob of interest
@@ -57,7 +73,7 @@ for blob in lst_blobs:
         .get_blob('curated_eng_txt/{}.txt'.format(doc_title))
 
     # populate to BQ dataset
-    exportItems2BQ(dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
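+    # Export this document's title and its raw, translated, and curated text blobs to BigQuery.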
+    exportItems2BQ(bq_client, dataset_id, table_id, doc_title, it_raw_blob, eng_raw_blob, curated_eng_blob)
 
 total_time = time.time() - start_time
 logging.info('The export to BigQuery was completed successfully and took {} minutes.'.format(round(total_time / 60, 1)))
@@ -66,7 +82,7 @@ curated_gcs_source_prefix = 'curated_eng_txt'
 lst_curated_blobs = storage_client.list_blobs(bucket_or_name=bucket_name,
                                               prefix=curated_gcs_source_prefix)
 
-nlp = loadModel(model=en_core_sci_lg)
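+# Use the small scispaCy model by default; any of the models imported above can be swapped in here.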
+nlp = loadModel(model=en_core_sci_sm)
 
 start_time = time.time()
 for blob in lst_curated_blobs: