diff --git a/datasets/idc/_images/copy_bq_datasets/Dockerfile b/datasets/idc/_images/copy_bq_datasets/Dockerfile new file mode 100644 index 000000000..de6dfa5ad --- /dev/null +++ b/datasets/idc/_images/copy_bq_datasets/Dockerfile @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.8 +ENV PYTHONUNBUFFERED True +COPY requirements.txt ./ +RUN python3 -m pip install --no-cache-dir -r requirements.txt +WORKDIR /custom +COPY ./script.py . +CMD ["python3", "script.py"] diff --git a/datasets/idc/_images/copy_bq_datasets/requirements.txt b/datasets/idc/_images/copy_bq_datasets/requirements.txt new file mode 100644 index 000000000..a567c09b4 --- /dev/null +++ b/datasets/idc/_images/copy_bq_datasets/requirements.txt @@ -0,0 +1,3 @@ +google-api-core +google-cloud-bigquery-datatransfer +protobuf diff --git a/datasets/idc/_images/copy_bq_datasets/script.py b/datasets/idc/_images/copy_bq_datasets/script.py new file mode 100644 index 000000000..89509167f --- /dev/null +++ b/datasets/idc/_images/copy_bq_datasets/script.py @@ -0,0 +1,182 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import operator +import os +import time +import typing + +from google.api_core.exceptions import ResourceExhausted +from google.cloud import bigquery_datatransfer_v1 +from google.protobuf.timestamp_pb2 import Timestamp + +RETRY_DELAY = 10 + + +class TimeoutError(Exception): + """Raised when the BQ transfer jobs haven't all finished within the allotted time""" + + pass + + +def main( + source_project_id: str, + target_project_id: str, + service_account: str, + dataset_name: str, + dataset_versions: typing.List[str], + timeout: int, +): + client = bigquery_datatransfer_v1.DataTransferServiceClient() + transfer_config_prefix = f"{dataset_name}-copy" + transfer_configs = client.list_transfer_configs( + request=bigquery_datatransfer_v1.types.ListTransferConfigsRequest( + parent=f"projects/{target_project_id}" + ) + ) + + existing_configs = [ + config + for config in transfer_configs + if config.display_name.startswith(transfer_config_prefix) + ] + + _running_configs = [] + for version in dataset_versions: + dataset_id = f"{dataset_name}_{version}" + display_name = f"{transfer_config_prefix}-{version}" + + _config = next( + ( + config + for config in existing_configs + if config.display_name == display_name + ), + None, + ) + if not _config: + _config = create_transfer_config( + client, + source_project_id, + target_project_id, + dataset_id, + display_name, + service_account, + ) + + trigger_config(client, _config) + _running_configs.append(_config) + + wait_for_completion(client, _running_configs, timeout) + + +def wait_for_completion( + client: bigquery_datatransfer_v1.DataTransferServiceClient, + running_configs: typing.List[bigquery_datatransfer_v1.types.TransferConfig], + timeout: int, +) -> None: + _start = int(time.time()) + + while True: + latest_runs = [] + for config in running_configs: + latest_runs.append(latest_transfer_run(client, config)) + + logging.info(f"States: {[str(run.state) for run in latest_runs]}") + + # Mark as complete when all runs have succeeded + if all([str(run.state) == "TransferState.SUCCEEDED" for run in latest_runs]): + return + + # Stop the process when it's longer than the allotted time + if int(time.time()) - _start > timeout: + raise TimeoutError + + time.sleep(RETRY_DELAY) + + +def latest_transfer_run( + client: bigquery_datatransfer_v1.DataTransferServiceClient, + config: bigquery_datatransfer_v1.types.TransferConfig, +) -> bigquery_datatransfer_v1.types.TransferRun: + transfer_runs = client.list_transfer_runs(parent=config.name) + return max(transfer_runs, key=operator.attrgetter("run_time")) + + +def create_transfer_config( + client: bigquery_datatransfer_v1.DataTransferServiceClient, + source_project_id: str, + target_project_id: str, + dataset_id: str, + display_name: str, + service_account: str, +) -> bigquery_datatransfer_v1.types.TransferConfig: + transfer_config = bigquery_datatransfer_v1.TransferConfig( + destination_dataset_id=dataset_id, + display_name=display_name, + data_source_id="cross_region_copy", + dataset_region="US", + params={ + "source_project_id": source_project_id, + "source_dataset_id": dataset_id, + }, + schedule_options=bigquery_datatransfer_v1.ScheduleOptions( + disable_auto_scheduling=True + ), + ) + + request = bigquery_datatransfer_v1.types.CreateTransferConfigRequest( + parent=client.common_project_path(target_project_id), + transfer_config=transfer_config, + service_account_name=service_account, + ) + + return client.create_transfer_config(request=request) + + +def trigger_config( + client: bigquery_datatransfer_v1.DataTransferServiceClient, + config: bigquery_datatransfer_v1.types.TransferConfig, +) -> None: + now = time.time() + seconds = int(now) + nanos = int((now - seconds) * 10 ** 9) + + try: + client.start_manual_transfer_runs( + request=bigquery_datatransfer_v1.types.StartManualTransferRunsRequest( + parent=config.name, + requested_run_time=Timestamp(seconds=seconds, nanos=nanos), + ) + ) + except ResourceExhausted: + logging.info( + f"Transfer job is currently running for config ({config.display_name}) {config.name}." + ) + return + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + + main( + source_project_id=os.environ["SOURCE_PROJECT_ID"], + target_project_id=os.environ["TARGET_PROJECT_ID"], + service_account=os.environ["SERVICE_ACCOUNT"], + dataset_name=os.environ["DATASET_NAME"], + dataset_versions=json.loads(os.environ["DATASET_VERSIONS"]), + timeout=int(os.getenv("TIMEOUT", 1200)), + ) diff --git a/datasets/idc/_images/generate_bq_views/Dockerfile b/datasets/idc/_images/generate_bq_views/Dockerfile new file mode 100644 index 000000000..c22867d65 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/Dockerfile @@ -0,0 +1,21 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.8 +ENV PYTHONUNBUFFERED True +COPY requirements.txt ./ +RUN python3 -m pip install --no-cache-dir -r requirements.txt +WORKDIR /custom +COPY . . +CMD ["python3", "script.py"] diff --git a/datasets/idc/_images/generate_bq_views/queries/.DS_Store b/datasets/idc/_images/generate_bq_views/queries/.DS_Store new file mode 100644 index 000000000..5d8b8366a Binary files /dev/null and b/datasets/idc/_images/generate_bq_views/queries/.DS_Store differ diff --git a/datasets/idc/_images/generate_bq_views/queries/v1/dicom_all.sql b/datasets/idc/_images/generate_bq_views/queries/v1/dicom_all.sql new file mode 100644 index 000000000..65c0bdaeb --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v1/dicom_all.sql @@ -0,0 +1,39 @@ +WITH + pre_dicom_all +AS ( + SELECT + aux.idc_webapp_collection_id AS collection_id, + aux.gcs_url as gcs_url, + aux.gcs_bucket as gcs_bucket, + aux.study_uuid as crdc_study_uuid, + aux.series_uuid as crdc_series_uuid, + aux.instance_uuid as crdc_instance_uuid, + aux.idc_case_id as idc_case_id, + aux.instance_size as instance_size, + aux.version_hash as version_hash, + aux.collection_hash as collection_hash, + aux.patient_hash as patient_hash, + aux.study_hash as study_hash, + aux.series_hash as series_hash, + aux.instance_hash as instance_hash, + aux.source_doi as Source_DOI, + dcm.* + FROM + `PROJECT.DATASET.auxiliary_metadata` AS aux + INNER JOIN + `PROJECT.DATASET.dicom_metadata` AS dcm + ON + aux.SOPInstanceUID = dcm.SOPInstanceUID +) + +SELECT + data_collections.Location AS tcia_tumorLocation, + data_collections.Species AS tcia_species, + data_collections.CancerType AS tcia_cancerType, + pre_dicom_all.* +FROM + pre_dicom_all +INNER JOIN + `PROJECT.DATASET.original_collections_metadata` AS data_collections +ON + pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id diff --git a/datasets/idc/_images/generate_bq_views/queries/v1/measurement_groups.sql b/datasets/idc/_images/generate_bq_views/queries/v1/measurement_groups.sql new file mode 100644 index 000000000..df0a8f046 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v1/measurement_groups.sql @@ -0,0 +1,208 @@ +WITH + measurementGroups AS ( + WITH + contentSequenceLevel1 AS ( + WITH + structuredReports AS ( + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + ContentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + ( SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.11" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.22" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.33" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.34" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.35" ) + AND ARRAY_LENGTH(ContentTemplateSequence) <> 0 + AND ContentTemplateSequence [ + OFFSET + (0)].TemplateIdentifier = "1500" + AND ContentTemplateSequence [ + OFFSET + (0)].MappingResource = "DCMR" ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence + FROM + structuredReports + CROSS JOIN + UNNEST(ContentSequence) AS contentSequence ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence, + measurementGroup_number + FROM + contentSequenceLevel1 + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WITH + OFFSET + AS measurementGroup_number + WHERE + contentSequence.ValueType = "CONTAINER" + AND contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeMeaning = "Measurement Group" ), + measurementGroups_withTrackingID AS ( + SELECT + SOPInstanceUID, + PatientID, + SeriesDescription, + measurementGroup_number, + unnestedContentSequence.TextValue AS trackingIdentifier, + measurementGroups.contentSequence + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "TEXT" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112039" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withTrackingUID AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS trackingUniqueIdentifier + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112040" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withSegmentation AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPInstanceUID AS segmentationInstanceUID, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSegmentNumber AS segmentationSegmentNumber + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "IMAGE" + AND unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ), + measurementGroups_withSourceSeries AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS sourceSegmentedSeriesUID + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121232" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFinding AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS finding + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121071" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFindingSite AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS findingSite + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "G-C0E3" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "SRT" ) ) +SELECT + mWithUID.SOPInstanceUID, + mWithUID.measurementGroup_number, + mWithUID.trackingUniqueIdentifier, + mWithID.trackingIdentifier, + mWithID.PatientID, + mWithID.SeriesDescription, + mWithFinding.finding, + mWithFindingSite.findingSite, + mWithSourceSeries.sourceSegmentedSeriesUID, + mWithSegmentation.segmentationInstanceUID, + mWithSegmentation.segmentationSegmentNumber, + mWithID.contentSequence +FROM + measurementGroups_withTrackingUID AS mWithUID +JOIN + measurementGroups_withTrackingID AS mWithID + --- +ON + mWithID.SOPInstanceUID = mWithUID.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithUID.measurementGroup_number +JOIN + measurementGroups_withFinding AS mWithFinding +ON + mWithID.SOPInstanceUID = mWithFinding.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFinding.measurementGroup_number +JOIN + measurementGroups_withFindingSite AS mWithFindingSite +ON + mWithID.SOPInstanceUID = mWithFindingSite.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFindingSite.measurementGroup_number +JOIN + measurementGroups_withSourceSeries AS mWithSourceSeries +ON + mWithID.SOPInstanceUID = mWithSourceSeries.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSourceSeries.measurementGroup_number +JOIN + measurementGroups_withSegmentation AS mWithSegmentation +ON + mWithID.SOPInstanceUID = mWithSegmentation.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSegmentation.measurementGroup_number + --- +ORDER BY + trackingUniqueIdentifier diff --git a/datasets/idc/_images/generate_bq_views/queries/v1/qualitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v1/qualitative_measurements.sql new file mode 100644 index 000000000..d020e4639 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v1/qualitative_measurements.sql @@ -0,0 +1,99 @@ +WITH + contentSequenceLevel3 AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + findingsAndFindingSites AS ( + WITH + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + ConceptCodeSequence AS finding + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + ConceptCodeSequence AS findingSite + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ) + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier, + findingSites.findingSite, + findingSites.measurementGroup_number + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) +SELECT + contentSequenceLevel3.PatientID, + contentSequenceLevel3.SOPInstanceUID, + findingsAndFindingSites.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3.ConceptNameCodeSequence AS Quantity, + contentSequenceLevel3.ConceptCodeSequence AS Value, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite +FROM + contentSequenceLevel3 +JOIN + findingsAndFindingSites +ON + contentSequenceLevel3.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3.measurementGroup_number = findingsAndFindingSites.measurementGroup_number +WHERE + # exclude + ( ConceptNameCodeSequence.CodeMeaning <> "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "DCM" ) AND # Finding + ( ConceptNameCodeSequence.CodeMeaning <> "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "SRT" ) # Finding Site + # correctness check: adding the below should result in a 36 rows column (4 segmented lesions, with 9 evaluations per each) + # AND + # contentSequenceLevel3.PatientID = "LIDC-IDRI-0001" diff --git a/datasets/idc/_images/generate_bq_views/queries/v1/quantitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v1/quantitative_measurements.sql new file mode 100644 index 000000000..4e71a043e --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v1/quantitative_measurements.sql @@ -0,0 +1,177 @@ +WITH + --- + contentSequenceLevel3numeric AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)] AS MeasuredValueSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)].MeasurementUnitsCodeSequence [ + SAFE_OFFSET + (0)] AS MeasurementUnits, + contentSequence.ContentSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "NUM" ), + --- + contentSequenceLevel3codes AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + --- + contentSequenceLevel3uidrefs AS ( + SELECT + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence, + measurementGroup_number + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "UIDREF" + AND ConceptCodeSequence [ + SAFE_OFFSET + (0)].CodeMeaning = "Tracking Unique Identifier" ), + --- + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS finding, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + --- + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS findingSite, + measurementGroup_number + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ), + --- + findingsAndFindingSites AS ( + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findingSites.findingSite, + findingSites.measurementGroup_number, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) --- + # correctness check: the below should result in 11 rows (this is how many segments/measurement + # groups are there for each QIN-HEADNCK-01-0139 segmentation + #SELECT + # * + #FROM + # findingsAndFindingSites + #WHERE + # SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + --- + SELECT + contentSequenceLevel3numeric.PatientID, + contentSequenceLevel3numeric.SOPInstanceUID, + contentSequenceLevel3numeric.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3numeric.ConceptNameCodeSequence AS Quantity, + CASE ( ARRAY_LENGTH(contentSequenceLevel3numeric.ContentSequence) > 0 + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodeValue = "121401" + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodingSchemeDesignator = "DCM" ) + WHEN TRUE THEN STRUCT( contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeValue AS CodeValue, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodingSchemeDesignator AS CodingSchemeDesignator, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeMeaning AS CodeMeaning ) + ELSE + STRUCT(NULL as CodeValue,NULL as CodingSchemeDesignator,NULL as CodeMeaning) + END + AS derivationModifier, + SAFE_CAST( contentSequenceLevel3numeric.MeasuredValueSequence.NumericValue [ + SAFE_OFFSET + (0)] AS NUMERIC ) AS Value, + contentSequenceLevel3numeric.MeasurementUnits AS Units, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite + FROM + contentSequenceLevel3numeric + JOIN + findingsAndFindingSites + ON + contentSequenceLevel3numeric.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3numeric.measurementGroup_number = findingsAndFindingSites.measurementGroup_number --- + # correctness check: for this patient, there should be 12 rows: 4 segmented nodules, with 3 numeric evaluations for each + #WHERE + # contentSequenceLevel3numeric.PatientID = "LIDC-IDRI-0001" + --- + # correctness check: for this specific instance, there should be 238 rows (11 segments) + #WHERE + # contentSequenceLevel3numeric.SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + #where contentSequenceLevel3numeric.PatientID LIKE "%QIN%" diff --git a/datasets/idc/_images/generate_bq_views/queries/v1/segmentations.sql b/datasets/idc/_images/generate_bq_views/queries/v1/segmentations.sql new file mode 100644 index 000000000..44ecc2ddf --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v1/segmentations.sql @@ -0,0 +1,65 @@ +WITH + segs AS ( + SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + SegmentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + # more reliable than Modality = "SEG" + SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ) +SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + CASE ARRAY_LENGTH(unnested.AnatomicRegionSequence) + WHEN 0 THEN NULL + ELSE + STRUCT( unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeValue AS CodeValue, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodingSchemeDesignator AS CodingSchemeDesignator, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeMeaning AS CodeMeaning ) +END + AS AnatomicRegion, + CASE ( ARRAY_LENGTH(unnested.AnatomicRegionSequence) > 0 + AND ARRAY_LENGTH( unnested.AnatomicRegionSequence [ + OFFSET + (0)].AnatomicRegionModifierSequence ) > 0 ) + WHEN TRUE THEN unnested.AnatomicRegionSequence [ OFFSET (0)].AnatomicRegionModifierSequence [ OFFSET (0)] #unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence, + ELSE + NULL +END + AS AnatomicRegionModifier, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyCategoryCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyCategoryCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyCategory, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyTypeCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyTypeCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyType, + #unnested.SegmentedPropertyTypeCodeSequence, + #unnested.SegmentedPropertyTypeModifierCodeSequence, + unnested.SegmentAlgorithmType, + unnested.SegmentNumber, + unnested.TrackingUID, + unnested.TrackingID +FROM + segs +CROSS JOIN + UNNEST(SegmentSequence) AS unnested # correctness check: there should be 4 segmented nodules for this subject diff --git a/datasets/idc/_images/generate_bq_views/queries/v2/dicom_all.sql b/datasets/idc/_images/generate_bq_views/queries/v2/dicom_all.sql new file mode 100644 index 000000000..7eeee777f --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v2/dicom_all.sql @@ -0,0 +1,37 @@ +WITH + pre_dicom_all AS ( + SELECT + aux.idc_webapp_collection_id AS collection_id, + aux.gcs_url as gcs_url, + aux.gcs_bucket as gcs_bucket, + aux.study_uuid as crdc_study_uuid, + aux.series_uuid as crdc_series_uuid, + aux.instance_uuid as crdc_instance_uuid, + aux.idc_case_id as idc_case_id, + aux.instance_size as instance_size, + aux.version_hash as version_hash, + aux.collection_hash as collection_hash, + aux.patient_hash as patient_hash, + aux.study_hash as study_hash, + aux.series_hash as series_hash, + aux.instance_hash as instance_hash, + aux.source_doi as Source_DOI, + dcm.* + FROM + `PROJECT.DATASET.auxiliary_metadata` AS aux + INNER JOIN + `PROJECT.DATASET.dicom_metadata` AS dcm + ON + aux.SOPInstanceUID = dcm.SOPInstanceUID) + + SELECT + data_collections.Location AS tcia_tumorLocation, + data_collections.Species AS tcia_species, + data_collections.CancerType AS tcia_cancerType, + pre_dicom_all.* + FROM + pre_dicom_all + INNER JOIN + `PROJECT.DATASET.original_collections_metadata` AS data_collections + ON + pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id diff --git a/datasets/idc/_images/generate_bq_views/queries/v2/measurement_groups.sql b/datasets/idc/_images/generate_bq_views/queries/v2/measurement_groups.sql new file mode 100644 index 000000000..df0a8f046 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v2/measurement_groups.sql @@ -0,0 +1,208 @@ +WITH + measurementGroups AS ( + WITH + contentSequenceLevel1 AS ( + WITH + structuredReports AS ( + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + ContentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + ( SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.11" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.22" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.33" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.34" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.35" ) + AND ARRAY_LENGTH(ContentTemplateSequence) <> 0 + AND ContentTemplateSequence [ + OFFSET + (0)].TemplateIdentifier = "1500" + AND ContentTemplateSequence [ + OFFSET + (0)].MappingResource = "DCMR" ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence + FROM + structuredReports + CROSS JOIN + UNNEST(ContentSequence) AS contentSequence ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence, + measurementGroup_number + FROM + contentSequenceLevel1 + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WITH + OFFSET + AS measurementGroup_number + WHERE + contentSequence.ValueType = "CONTAINER" + AND contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeMeaning = "Measurement Group" ), + measurementGroups_withTrackingID AS ( + SELECT + SOPInstanceUID, + PatientID, + SeriesDescription, + measurementGroup_number, + unnestedContentSequence.TextValue AS trackingIdentifier, + measurementGroups.contentSequence + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "TEXT" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112039" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withTrackingUID AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS trackingUniqueIdentifier + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112040" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withSegmentation AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPInstanceUID AS segmentationInstanceUID, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSegmentNumber AS segmentationSegmentNumber + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "IMAGE" + AND unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ), + measurementGroups_withSourceSeries AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS sourceSegmentedSeriesUID + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121232" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFinding AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS finding + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121071" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFindingSite AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS findingSite + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "G-C0E3" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "SRT" ) ) +SELECT + mWithUID.SOPInstanceUID, + mWithUID.measurementGroup_number, + mWithUID.trackingUniqueIdentifier, + mWithID.trackingIdentifier, + mWithID.PatientID, + mWithID.SeriesDescription, + mWithFinding.finding, + mWithFindingSite.findingSite, + mWithSourceSeries.sourceSegmentedSeriesUID, + mWithSegmentation.segmentationInstanceUID, + mWithSegmentation.segmentationSegmentNumber, + mWithID.contentSequence +FROM + measurementGroups_withTrackingUID AS mWithUID +JOIN + measurementGroups_withTrackingID AS mWithID + --- +ON + mWithID.SOPInstanceUID = mWithUID.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithUID.measurementGroup_number +JOIN + measurementGroups_withFinding AS mWithFinding +ON + mWithID.SOPInstanceUID = mWithFinding.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFinding.measurementGroup_number +JOIN + measurementGroups_withFindingSite AS mWithFindingSite +ON + mWithID.SOPInstanceUID = mWithFindingSite.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFindingSite.measurementGroup_number +JOIN + measurementGroups_withSourceSeries AS mWithSourceSeries +ON + mWithID.SOPInstanceUID = mWithSourceSeries.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSourceSeries.measurementGroup_number +JOIN + measurementGroups_withSegmentation AS mWithSegmentation +ON + mWithID.SOPInstanceUID = mWithSegmentation.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSegmentation.measurementGroup_number + --- +ORDER BY + trackingUniqueIdentifier diff --git a/datasets/idc/_images/generate_bq_views/queries/v2/qualitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v2/qualitative_measurements.sql new file mode 100644 index 000000000..d020e4639 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v2/qualitative_measurements.sql @@ -0,0 +1,99 @@ +WITH + contentSequenceLevel3 AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + findingsAndFindingSites AS ( + WITH + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + ConceptCodeSequence AS finding + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + ConceptCodeSequence AS findingSite + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ) + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier, + findingSites.findingSite, + findingSites.measurementGroup_number + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) +SELECT + contentSequenceLevel3.PatientID, + contentSequenceLevel3.SOPInstanceUID, + findingsAndFindingSites.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3.ConceptNameCodeSequence AS Quantity, + contentSequenceLevel3.ConceptCodeSequence AS Value, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite +FROM + contentSequenceLevel3 +JOIN + findingsAndFindingSites +ON + contentSequenceLevel3.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3.measurementGroup_number = findingsAndFindingSites.measurementGroup_number +WHERE + # exclude + ( ConceptNameCodeSequence.CodeMeaning <> "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "DCM" ) AND # Finding + ( ConceptNameCodeSequence.CodeMeaning <> "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "SRT" ) # Finding Site + # correctness check: adding the below should result in a 36 rows column (4 segmented lesions, with 9 evaluations per each) + # AND + # contentSequenceLevel3.PatientID = "LIDC-IDRI-0001" diff --git a/datasets/idc/_images/generate_bq_views/queries/v2/quantitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v2/quantitative_measurements.sql new file mode 100644 index 000000000..4e71a043e --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v2/quantitative_measurements.sql @@ -0,0 +1,177 @@ +WITH + --- + contentSequenceLevel3numeric AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)] AS MeasuredValueSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)].MeasurementUnitsCodeSequence [ + SAFE_OFFSET + (0)] AS MeasurementUnits, + contentSequence.ContentSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "NUM" ), + --- + contentSequenceLevel3codes AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + --- + contentSequenceLevel3uidrefs AS ( + SELECT + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence, + measurementGroup_number + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "UIDREF" + AND ConceptCodeSequence [ + SAFE_OFFSET + (0)].CodeMeaning = "Tracking Unique Identifier" ), + --- + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS finding, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + --- + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS findingSite, + measurementGroup_number + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ), + --- + findingsAndFindingSites AS ( + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findingSites.findingSite, + findingSites.measurementGroup_number, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) --- + # correctness check: the below should result in 11 rows (this is how many segments/measurement + # groups are there for each QIN-HEADNCK-01-0139 segmentation + #SELECT + # * + #FROM + # findingsAndFindingSites + #WHERE + # SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + --- + SELECT + contentSequenceLevel3numeric.PatientID, + contentSequenceLevel3numeric.SOPInstanceUID, + contentSequenceLevel3numeric.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3numeric.ConceptNameCodeSequence AS Quantity, + CASE ( ARRAY_LENGTH(contentSequenceLevel3numeric.ContentSequence) > 0 + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodeValue = "121401" + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodingSchemeDesignator = "DCM" ) + WHEN TRUE THEN STRUCT( contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeValue AS CodeValue, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodingSchemeDesignator AS CodingSchemeDesignator, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeMeaning AS CodeMeaning ) + ELSE + STRUCT(NULL as CodeValue,NULL as CodingSchemeDesignator,NULL as CodeMeaning) + END + AS derivationModifier, + SAFE_CAST( contentSequenceLevel3numeric.MeasuredValueSequence.NumericValue [ + SAFE_OFFSET + (0)] AS NUMERIC ) AS Value, + contentSequenceLevel3numeric.MeasurementUnits AS Units, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite + FROM + contentSequenceLevel3numeric + JOIN + findingsAndFindingSites + ON + contentSequenceLevel3numeric.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3numeric.measurementGroup_number = findingsAndFindingSites.measurementGroup_number --- + # correctness check: for this patient, there should be 12 rows: 4 segmented nodules, with 3 numeric evaluations for each + #WHERE + # contentSequenceLevel3numeric.PatientID = "LIDC-IDRI-0001" + --- + # correctness check: for this specific instance, there should be 238 rows (11 segments) + #WHERE + # contentSequenceLevel3numeric.SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + #where contentSequenceLevel3numeric.PatientID LIKE "%QIN%" diff --git a/datasets/idc/_images/generate_bq_views/queries/v2/segmentations.sql b/datasets/idc/_images/generate_bq_views/queries/v2/segmentations.sql new file mode 100644 index 000000000..b9f2fe647 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v2/segmentations.sql @@ -0,0 +1,80 @@ +WITH + segs AS ( + SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + SegmentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + # more reliable than Modality = "SEG" + SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ) +SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + CASE ARRAY_LENGTH(unnested.AnatomicRegionSequence) + WHEN 0 THEN NULL + ELSE + STRUCT( unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeValue AS CodeValue, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodingSchemeDesignator AS CodingSchemeDesignator, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeMeaning AS CodeMeaning ) +END + AS AnatomicRegion, + CASE ( ARRAY_LENGTH(unnested.AnatomicRegionSequence) > 0 + AND ARRAY_LENGTH( unnested.AnatomicRegionSequence [ + OFFSET + (0)].AnatomicRegionModifierSequence ) > 0 ) + WHEN TRUE THEN unnested.AnatomicRegionSequence [ OFFSET (0)].AnatomicRegionModifierSequence [ OFFSET (0)] #unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence, + ELSE + NULL +END + AS AnatomicRegionModifier, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyCategoryCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyCategoryCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyCategory, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyTypeCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyTypeCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyType, + #unnested.SegmentedPropertyTypeCodeSequence, + #unnested.SegmentedPropertyTypeModifierCodeSequence, + unnested.SegmentAlgorithmType, + unnested.SegmentNumber, + unnested.TrackingUID, + unnested.TrackingID +FROM + segs +CROSS JOIN + UNNEST(SegmentSequence) AS unnested # correctness check: there should be 4 segmented nodules for this subject + #where PatientID = "LIDC-IDRI-0001" + # Note that it is possible to have some of those sequences empty! + # Debug: + #WHERE + # ARRAY_LENGTH(unnested.AnatomicRegionSequence) = 0 + # Debug: + # ) +#SELECT +# DISTINCT SegmentedPropertyTypeCodeSequence[ +#OFFSET +# (0)].CodeMeaning +#FROM +# segs_details +#WHERE +# ARRAY_LENGTH(SegmentedPropertyTypeCodeSequence) <> 0 diff --git a/datasets/idc/_images/generate_bq_views/queries/v3/dicom_all.sql b/datasets/idc/_images/generate_bq_views/queries/v3/dicom_all.sql new file mode 100644 index 000000000..93fb70c44 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v3/dicom_all.sql @@ -0,0 +1,51 @@ +WITH + pre_dicom_all AS ( + SELECT + aux.tcia_api_collection_id AS tcia_api_collection_id, + aux.idc_webapp_collection_id AS collection_id, + aux.collection_timestamp AS collection_timestamp, + aux.collection_hash as collection_hash, + aux.collection_init_idc_version AS collection_init_idc_version, + aux.collection_revised_idc_version AS collection_revised_idc_version, + dcm.PatientID as PatientID, + aux.idc_case_id as idc_case_id, + aux.patient_hash as patient_hash, + aux.patient_init_idc_version AS patient_init_idc_version, + aux.patient_revised_idc_version AS patient_revised_idc_version, + dcm.StudyInstanceUID AS StudyInstanceUID, + aux.study_uuid as crdc_study_uuid, + aux.study_hash as study_hash, + aux.study_init_idc_version AS study_init_idc_version, + aux.study_revised_idc_version AS study_revised_idc_version, + dcm.SeriesInstanceUID AS SeriesInstanceUID, + aux.series_uuid as crdc_series_uuid, + aux.series_hash as series_hash, + aux.series_init_idc_version AS series_init_idc_version, + aux.series_revised_idc_version AS series_revised_idc_version, + dcm.SOPInstanceUID AS SOPInstanceUID, + aux.instance_uuid as crdc_instance_uuid, + aux.gcs_url as gcs_url, + aux.instance_size as instance_size, + aux.instance_hash as instance_hash, + aux.instance_init_idc_version AS instance_init_idc_version, + aux.instance_revised_idc_version AS instance_revised_idc_version, + aux.source_doi as Source_DOI, + dcm.* except(PatientID, StudyInstanceUID, SeriesInstanceUID, SOPInstanceUID) + FROM + `PROJECT.DATASET.auxiliary_metadata` AS aux + INNER JOIN + `PROJECT.DATASET.dicom_metadata` AS dcm + ON + aux.SOPInstanceUID = dcm.SOPInstanceUID) + + SELECT + data_collections.Location AS tcia_tumorLocation, + data_collections.Species AS tcia_species, + data_collections.CancerType AS tcia_cancerType, + pre_dicom_all.* + FROM + pre_dicom_all + INNER JOIN + `PROJECT.DATASET.original_collections_metadata` AS data_collections + ON + pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id diff --git a/datasets/idc/_images/generate_bq_views/queries/v3/measurement_groups.sql b/datasets/idc/_images/generate_bq_views/queries/v3/measurement_groups.sql new file mode 100644 index 000000000..df0a8f046 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v3/measurement_groups.sql @@ -0,0 +1,208 @@ +WITH + measurementGroups AS ( + WITH + contentSequenceLevel1 AS ( + WITH + structuredReports AS ( + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + ContentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + ( SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.11" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.22" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.33" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.34" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.35" ) + AND ARRAY_LENGTH(ContentTemplateSequence) <> 0 + AND ContentTemplateSequence [ + OFFSET + (0)].TemplateIdentifier = "1500" + AND ContentTemplateSequence [ + OFFSET + (0)].MappingResource = "DCMR" ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence + FROM + structuredReports + CROSS JOIN + UNNEST(ContentSequence) AS contentSequence ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence, + measurementGroup_number + FROM + contentSequenceLevel1 + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WITH + OFFSET + AS measurementGroup_number + WHERE + contentSequence.ValueType = "CONTAINER" + AND contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeMeaning = "Measurement Group" ), + measurementGroups_withTrackingID AS ( + SELECT + SOPInstanceUID, + PatientID, + SeriesDescription, + measurementGroup_number, + unnestedContentSequence.TextValue AS trackingIdentifier, + measurementGroups.contentSequence + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "TEXT" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112039" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withTrackingUID AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS trackingUniqueIdentifier + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112040" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withSegmentation AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPInstanceUID AS segmentationInstanceUID, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSegmentNumber AS segmentationSegmentNumber + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "IMAGE" + AND unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ), + measurementGroups_withSourceSeries AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS sourceSegmentedSeriesUID + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121232" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFinding AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS finding + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121071" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFindingSite AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS findingSite + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "G-C0E3" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "SRT" ) ) +SELECT + mWithUID.SOPInstanceUID, + mWithUID.measurementGroup_number, + mWithUID.trackingUniqueIdentifier, + mWithID.trackingIdentifier, + mWithID.PatientID, + mWithID.SeriesDescription, + mWithFinding.finding, + mWithFindingSite.findingSite, + mWithSourceSeries.sourceSegmentedSeriesUID, + mWithSegmentation.segmentationInstanceUID, + mWithSegmentation.segmentationSegmentNumber, + mWithID.contentSequence +FROM + measurementGroups_withTrackingUID AS mWithUID +JOIN + measurementGroups_withTrackingID AS mWithID + --- +ON + mWithID.SOPInstanceUID = mWithUID.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithUID.measurementGroup_number +JOIN + measurementGroups_withFinding AS mWithFinding +ON + mWithID.SOPInstanceUID = mWithFinding.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFinding.measurementGroup_number +JOIN + measurementGroups_withFindingSite AS mWithFindingSite +ON + mWithID.SOPInstanceUID = mWithFindingSite.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFindingSite.measurementGroup_number +JOIN + measurementGroups_withSourceSeries AS mWithSourceSeries +ON + mWithID.SOPInstanceUID = mWithSourceSeries.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSourceSeries.measurementGroup_number +JOIN + measurementGroups_withSegmentation AS mWithSegmentation +ON + mWithID.SOPInstanceUID = mWithSegmentation.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSegmentation.measurementGroup_number + --- +ORDER BY + trackingUniqueIdentifier diff --git a/datasets/idc/_images/generate_bq_views/queries/v3/qualitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v3/qualitative_measurements.sql new file mode 100644 index 000000000..d020e4639 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v3/qualitative_measurements.sql @@ -0,0 +1,99 @@ +WITH + contentSequenceLevel3 AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + findingsAndFindingSites AS ( + WITH + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + ConceptCodeSequence AS finding + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + ConceptCodeSequence AS findingSite + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ) + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier, + findingSites.findingSite, + findingSites.measurementGroup_number + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) +SELECT + contentSequenceLevel3.PatientID, + contentSequenceLevel3.SOPInstanceUID, + findingsAndFindingSites.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3.ConceptNameCodeSequence AS Quantity, + contentSequenceLevel3.ConceptCodeSequence AS Value, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite +FROM + contentSequenceLevel3 +JOIN + findingsAndFindingSites +ON + contentSequenceLevel3.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3.measurementGroup_number = findingsAndFindingSites.measurementGroup_number +WHERE + # exclude + ( ConceptNameCodeSequence.CodeMeaning <> "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "DCM" ) AND # Finding + ( ConceptNameCodeSequence.CodeMeaning <> "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "SRT" ) # Finding Site + # correctness check: adding the below should result in a 36 rows column (4 segmented lesions, with 9 evaluations per each) + # AND + # contentSequenceLevel3.PatientID = "LIDC-IDRI-0001" diff --git a/datasets/idc/_images/generate_bq_views/queries/v3/quantitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v3/quantitative_measurements.sql new file mode 100644 index 000000000..4e71a043e --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v3/quantitative_measurements.sql @@ -0,0 +1,177 @@ +WITH + --- + contentSequenceLevel3numeric AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)] AS MeasuredValueSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)].MeasurementUnitsCodeSequence [ + SAFE_OFFSET + (0)] AS MeasurementUnits, + contentSequence.ContentSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "NUM" ), + --- + contentSequenceLevel3codes AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + --- + contentSequenceLevel3uidrefs AS ( + SELECT + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence, + measurementGroup_number + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "UIDREF" + AND ConceptCodeSequence [ + SAFE_OFFSET + (0)].CodeMeaning = "Tracking Unique Identifier" ), + --- + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS finding, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + --- + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS findingSite, + measurementGroup_number + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ), + --- + findingsAndFindingSites AS ( + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findingSites.findingSite, + findingSites.measurementGroup_number, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) --- + # correctness check: the below should result in 11 rows (this is how many segments/measurement + # groups are there for each QIN-HEADNCK-01-0139 segmentation + #SELECT + # * + #FROM + # findingsAndFindingSites + #WHERE + # SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + --- + SELECT + contentSequenceLevel3numeric.PatientID, + contentSequenceLevel3numeric.SOPInstanceUID, + contentSequenceLevel3numeric.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3numeric.ConceptNameCodeSequence AS Quantity, + CASE ( ARRAY_LENGTH(contentSequenceLevel3numeric.ContentSequence) > 0 + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodeValue = "121401" + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodingSchemeDesignator = "DCM" ) + WHEN TRUE THEN STRUCT( contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeValue AS CodeValue, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodingSchemeDesignator AS CodingSchemeDesignator, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeMeaning AS CodeMeaning ) + ELSE + STRUCT(NULL as CodeValue,NULL as CodingSchemeDesignator,NULL as CodeMeaning) + END + AS derivationModifier, + SAFE_CAST( contentSequenceLevel3numeric.MeasuredValueSequence.NumericValue [ + SAFE_OFFSET + (0)] AS NUMERIC ) AS Value, + contentSequenceLevel3numeric.MeasurementUnits AS Units, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite + FROM + contentSequenceLevel3numeric + JOIN + findingsAndFindingSites + ON + contentSequenceLevel3numeric.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3numeric.measurementGroup_number = findingsAndFindingSites.measurementGroup_number --- + # correctness check: for this patient, there should be 12 rows: 4 segmented nodules, with 3 numeric evaluations for each + #WHERE + # contentSequenceLevel3numeric.PatientID = "LIDC-IDRI-0001" + --- + # correctness check: for this specific instance, there should be 238 rows (11 segments) + #WHERE + # contentSequenceLevel3numeric.SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + #where contentSequenceLevel3numeric.PatientID LIKE "%QIN%" diff --git a/datasets/idc/_images/generate_bq_views/queries/v3/segmentations.sql b/datasets/idc/_images/generate_bq_views/queries/v3/segmentations.sql new file mode 100644 index 000000000..cbbfceb54 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v3/segmentations.sql @@ -0,0 +1,86 @@ +# TODO: +# * account for SCT codes in addition to SRT +# * add Anatomic Region Modifier +# Debug: +#WITH +# segs_details AS ( +WITH + segs AS ( + SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + SegmentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + # more reliable than Modality = "SEG" + SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ) +SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + CASE ARRAY_LENGTH(unnested.AnatomicRegionSequence) + WHEN 0 THEN NULL + ELSE + STRUCT( unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeValue AS CodeValue, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodingSchemeDesignator AS CodingSchemeDesignator, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeMeaning AS CodeMeaning ) +END + AS AnatomicRegion, + CASE ( ARRAY_LENGTH(unnested.AnatomicRegionSequence) > 0 + AND ARRAY_LENGTH( unnested.AnatomicRegionSequence [ + OFFSET + (0)].AnatomicRegionModifierSequence ) > 0 ) + WHEN TRUE THEN unnested.AnatomicRegionSequence [ OFFSET (0)].AnatomicRegionModifierSequence [ OFFSET (0)] #unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence, + ELSE + NULL +END + AS AnatomicRegionModifier, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyCategoryCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyCategoryCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyCategory, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyTypeCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyTypeCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyType, + #unnested.SegmentedPropertyTypeCodeSequence, + #unnested.SegmentedPropertyTypeModifierCodeSequence, + unnested.SegmentAlgorithmType, + unnested.SegmentNumber, + unnested.TrackingUID, + unnested.TrackingID +FROM + segs +CROSS JOIN + UNNEST(SegmentSequence) AS unnested # correctness check: there should be 4 segmented nodules for this subject + #where PatientID = "LIDC-IDRI-0001" + # Note that it is possible to have some of those sequences empty! + # Debug: + #WHERE + # ARRAY_LENGTH(unnested.AnatomicRegionSequence) = 0 + # Debug: + # ) +#SELECT +# DISTINCT SegmentedPropertyTypeCodeSequence[ +#OFFSET +# (0)].CodeMeaning +#FROM +# segs_details +#WHERE +# ARRAY_LENGTH(SegmentedPropertyTypeCodeSequence) <> 0 diff --git a/datasets/idc/_images/generate_bq_views/queries/v4/dicom_all.sql b/datasets/idc/_images/generate_bq_views/queries/v4/dicom_all.sql new file mode 100644 index 000000000..3fc358634 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v4/dicom_all.sql @@ -0,0 +1,54 @@ +WITH + pre_dicom_all AS ( + SELECT + aux.tcia_api_collection_id AS tcia_api_collection_id, + aux.idc_webapp_collection_id AS collection_id, + aux.collection_timestamp AS collection_timestamp, + aux.collection_hash as collection_hash, + aux.collection_init_idc_version AS collection_init_idc_version, + aux.collection_revised_idc_version AS collection_revised_idc_version, + dcm.PatientID as PatientID, + aux.idc_case_id as idc_case_id, + aux.patient_hash as patient_hash, + aux.patient_init_idc_version AS patient_init_idc_version, + aux.patient_revised_idc_version AS patient_revised_idc_version, + dcm.StudyInstanceUID AS StudyInstanceUID, + aux.study_uuid as crdc_study_uuid, + aux.study_hash as study_hash, + aux.study_init_idc_version AS study_init_idc_version, + aux.study_revised_idc_version AS study_revised_idc_version, + dcm.SeriesInstanceUID AS SeriesInstanceUID, + aux.series_uuid as crdc_series_uuid, + aux.series_hash as series_hash, + aux.series_init_idc_version AS series_init_idc_version, + aux.series_revised_idc_version AS series_revised_idc_version, + dcm.SOPInstanceUID AS SOPInstanceUID, + aux.instance_uuid as crdc_instance_uuid, + aux.gcs_url as gcs_url, + aux.instance_size as instance_size, + aux.instance_hash as instance_hash, + aux.instance_init_idc_version AS instance_init_idc_version, + aux.instance_revised_idc_version AS instance_revised_idc_version, + aux.source_doi as Source_DOI, + aux.license_url as license_url, + aux.license_long_name as license_long_name, + aux.license_short_name as license_short_name, + dcm.* except(PatientID, StudyInstanceUID, SeriesInstanceUID, SOPInstanceUID) + FROM + `PROJECT.DATASET.auxiliary_metadata` AS aux + INNER JOIN + `PROJECT.DATASET.dicom_metadata` AS dcm + ON + aux.SOPInstanceUID = dcm.SOPInstanceUID) + + SELECT + data_collections.Location AS tcia_tumorLocation, + data_collections.Species AS tcia_species, + data_collections.CancerType AS tcia_cancerType, + pre_dicom_all.* + FROM + pre_dicom_all + INNER JOIN + `PROJECT.DATASET.original_collections_metadata` AS data_collections + ON + pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id diff --git a/datasets/idc/_images/generate_bq_views/queries/v4/measurement_groups.sql b/datasets/idc/_images/generate_bq_views/queries/v4/measurement_groups.sql new file mode 100644 index 000000000..df0a8f046 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v4/measurement_groups.sql @@ -0,0 +1,208 @@ +WITH + measurementGroups AS ( + WITH + contentSequenceLevel1 AS ( + WITH + structuredReports AS ( + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + ContentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + ( SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.11" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.22" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.33" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.34" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.35" ) + AND ARRAY_LENGTH(ContentTemplateSequence) <> 0 + AND ContentTemplateSequence [ + OFFSET + (0)].TemplateIdentifier = "1500" + AND ContentTemplateSequence [ + OFFSET + (0)].MappingResource = "DCMR" ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence + FROM + structuredReports + CROSS JOIN + UNNEST(ContentSequence) AS contentSequence ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence, + measurementGroup_number + FROM + contentSequenceLevel1 + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WITH + OFFSET + AS measurementGroup_number + WHERE + contentSequence.ValueType = "CONTAINER" + AND contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeMeaning = "Measurement Group" ), + measurementGroups_withTrackingID AS ( + SELECT + SOPInstanceUID, + PatientID, + SeriesDescription, + measurementGroup_number, + unnestedContentSequence.TextValue AS trackingIdentifier, + measurementGroups.contentSequence + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "TEXT" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112039" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withTrackingUID AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS trackingUniqueIdentifier + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112040" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withSegmentation AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPInstanceUID AS segmentationInstanceUID, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSegmentNumber AS segmentationSegmentNumber + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "IMAGE" + AND unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ), + measurementGroups_withSourceSeries AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS sourceSegmentedSeriesUID + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121232" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFinding AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS finding + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121071" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFindingSite AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS findingSite + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "G-C0E3" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "SRT" ) ) +SELECT + mWithUID.SOPInstanceUID, + mWithUID.measurementGroup_number, + mWithUID.trackingUniqueIdentifier, + mWithID.trackingIdentifier, + mWithID.PatientID, + mWithID.SeriesDescription, + mWithFinding.finding, + mWithFindingSite.findingSite, + mWithSourceSeries.sourceSegmentedSeriesUID, + mWithSegmentation.segmentationInstanceUID, + mWithSegmentation.segmentationSegmentNumber, + mWithID.contentSequence +FROM + measurementGroups_withTrackingUID AS mWithUID +JOIN + measurementGroups_withTrackingID AS mWithID + --- +ON + mWithID.SOPInstanceUID = mWithUID.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithUID.measurementGroup_number +JOIN + measurementGroups_withFinding AS mWithFinding +ON + mWithID.SOPInstanceUID = mWithFinding.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFinding.measurementGroup_number +JOIN + measurementGroups_withFindingSite AS mWithFindingSite +ON + mWithID.SOPInstanceUID = mWithFindingSite.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFindingSite.measurementGroup_number +JOIN + measurementGroups_withSourceSeries AS mWithSourceSeries +ON + mWithID.SOPInstanceUID = mWithSourceSeries.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSourceSeries.measurementGroup_number +JOIN + measurementGroups_withSegmentation AS mWithSegmentation +ON + mWithID.SOPInstanceUID = mWithSegmentation.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSegmentation.measurementGroup_number + --- +ORDER BY + trackingUniqueIdentifier diff --git a/datasets/idc/_images/generate_bq_views/queries/v4/qualitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v4/qualitative_measurements.sql new file mode 100644 index 000000000..d020e4639 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v4/qualitative_measurements.sql @@ -0,0 +1,99 @@ +WITH + contentSequenceLevel3 AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + findingsAndFindingSites AS ( + WITH + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + ConceptCodeSequence AS finding + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + ConceptCodeSequence AS findingSite + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ) + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier, + findingSites.findingSite, + findingSites.measurementGroup_number + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) +SELECT + contentSequenceLevel3.PatientID, + contentSequenceLevel3.SOPInstanceUID, + findingsAndFindingSites.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3.ConceptNameCodeSequence AS Quantity, + contentSequenceLevel3.ConceptCodeSequence AS Value, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite +FROM + contentSequenceLevel3 +JOIN + findingsAndFindingSites +ON + contentSequenceLevel3.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3.measurementGroup_number = findingsAndFindingSites.measurementGroup_number +WHERE + # exclude + ( ConceptNameCodeSequence.CodeMeaning <> "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "DCM" ) AND # Finding + ( ConceptNameCodeSequence.CodeMeaning <> "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "SRT" ) # Finding Site + # correctness check: adding the below should result in a 36 rows column (4 segmented lesions, with 9 evaluations per each) + # AND + # contentSequenceLevel3.PatientID = "LIDC-IDRI-0001" diff --git a/datasets/idc/_images/generate_bq_views/queries/v4/quantitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v4/quantitative_measurements.sql new file mode 100644 index 000000000..4e71a043e --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v4/quantitative_measurements.sql @@ -0,0 +1,177 @@ +WITH + --- + contentSequenceLevel3numeric AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)] AS MeasuredValueSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)].MeasurementUnitsCodeSequence [ + SAFE_OFFSET + (0)] AS MeasurementUnits, + contentSequence.ContentSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "NUM" ), + --- + contentSequenceLevel3codes AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + --- + contentSequenceLevel3uidrefs AS ( + SELECT + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence, + measurementGroup_number + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "UIDREF" + AND ConceptCodeSequence [ + SAFE_OFFSET + (0)].CodeMeaning = "Tracking Unique Identifier" ), + --- + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS finding, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + --- + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS findingSite, + measurementGroup_number + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ), + --- + findingsAndFindingSites AS ( + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findingSites.findingSite, + findingSites.measurementGroup_number, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) --- + # correctness check: the below should result in 11 rows (this is how many segments/measurement + # groups are there for each QIN-HEADNCK-01-0139 segmentation + #SELECT + # * + #FROM + # findingsAndFindingSites + #WHERE + # SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + --- + SELECT + contentSequenceLevel3numeric.PatientID, + contentSequenceLevel3numeric.SOPInstanceUID, + contentSequenceLevel3numeric.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3numeric.ConceptNameCodeSequence AS Quantity, + CASE ( ARRAY_LENGTH(contentSequenceLevel3numeric.ContentSequence) > 0 + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodeValue = "121401" + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodingSchemeDesignator = "DCM" ) + WHEN TRUE THEN STRUCT( contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeValue AS CodeValue, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodingSchemeDesignator AS CodingSchemeDesignator, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeMeaning AS CodeMeaning ) + ELSE + STRUCT(NULL as CodeValue,NULL as CodingSchemeDesignator,NULL as CodeMeaning) + END + AS derivationModifier, + SAFE_CAST( contentSequenceLevel3numeric.MeasuredValueSequence.NumericValue [ + SAFE_OFFSET + (0)] AS NUMERIC ) AS Value, + contentSequenceLevel3numeric.MeasurementUnits AS Units, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite + FROM + contentSequenceLevel3numeric + JOIN + findingsAndFindingSites + ON + contentSequenceLevel3numeric.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3numeric.measurementGroup_number = findingsAndFindingSites.measurementGroup_number --- + # correctness check: for this patient, there should be 12 rows: 4 segmented nodules, with 3 numeric evaluations for each + #WHERE + # contentSequenceLevel3numeric.PatientID = "LIDC-IDRI-0001" + --- + # correctness check: for this specific instance, there should be 238 rows (11 segments) + #WHERE + # contentSequenceLevel3numeric.SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + #where contentSequenceLevel3numeric.PatientID LIKE "%QIN%" diff --git a/datasets/idc/_images/generate_bq_views/queries/v4/segmentations.sql b/datasets/idc/_images/generate_bq_views/queries/v4/segmentations.sql new file mode 100644 index 000000000..cbbfceb54 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v4/segmentations.sql @@ -0,0 +1,86 @@ +# TODO: +# * account for SCT codes in addition to SRT +# * add Anatomic Region Modifier +# Debug: +#WITH +# segs_details AS ( +WITH + segs AS ( + SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + SegmentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + # more reliable than Modality = "SEG" + SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ) +SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + CASE ARRAY_LENGTH(unnested.AnatomicRegionSequence) + WHEN 0 THEN NULL + ELSE + STRUCT( unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeValue AS CodeValue, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodingSchemeDesignator AS CodingSchemeDesignator, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeMeaning AS CodeMeaning ) +END + AS AnatomicRegion, + CASE ( ARRAY_LENGTH(unnested.AnatomicRegionSequence) > 0 + AND ARRAY_LENGTH( unnested.AnatomicRegionSequence [ + OFFSET + (0)].AnatomicRegionModifierSequence ) > 0 ) + WHEN TRUE THEN unnested.AnatomicRegionSequence [ OFFSET (0)].AnatomicRegionModifierSequence [ OFFSET (0)] #unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence, + ELSE + NULL +END + AS AnatomicRegionModifier, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyCategoryCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyCategoryCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyCategory, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyTypeCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyTypeCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyType, + #unnested.SegmentedPropertyTypeCodeSequence, + #unnested.SegmentedPropertyTypeModifierCodeSequence, + unnested.SegmentAlgorithmType, + unnested.SegmentNumber, + unnested.TrackingUID, + unnested.TrackingID +FROM + segs +CROSS JOIN + UNNEST(SegmentSequence) AS unnested # correctness check: there should be 4 segmented nodules for this subject + #where PatientID = "LIDC-IDRI-0001" + # Note that it is possible to have some of those sequences empty! + # Debug: + #WHERE + # ARRAY_LENGTH(unnested.AnatomicRegionSequence) = 0 + # Debug: + # ) +#SELECT +# DISTINCT SegmentedPropertyTypeCodeSequence[ +#OFFSET +# (0)].CodeMeaning +#FROM +# segs_details +#WHERE +# ARRAY_LENGTH(SegmentedPropertyTypeCodeSequence) <> 0 diff --git a/datasets/idc/_images/generate_bq_views/queries/v5/dicom_all.sql b/datasets/idc/_images/generate_bq_views/queries/v5/dicom_all.sql new file mode 100644 index 000000000..1b76d00f8 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v5/dicom_all.sql @@ -0,0 +1,55 @@ +WITH + pre_dicom_all AS ( + SELECT + aux.tcia_api_collection_id AS tcia_api_collection_id, + aux.idc_webapp_collection_id AS collection_id, + aux.collection_timestamp AS collection_timestamp, + aux.collection_hash as collection_hash, + aux.collection_init_idc_version AS collection_init_idc_version, + aux.collection_revised_idc_version AS collection_revised_idc_version, + aux.access AS access, + dcm.PatientID as PatientID, + aux.idc_case_id as idc_case_id, + aux.patient_hash as patient_hash, + aux.patient_init_idc_version AS patient_init_idc_version, + aux.patient_revised_idc_version AS patient_revised_idc_version, + dcm.StudyInstanceUID AS StudyInstanceUID, + aux.study_uuid as crdc_study_uuid, + aux.study_hash as study_hash, + aux.study_init_idc_version AS study_init_idc_version, + aux.study_revised_idc_version AS study_revised_idc_version, + dcm.SeriesInstanceUID AS SeriesInstanceUID, + aux.series_uuid as crdc_series_uuid, + aux.series_hash as series_hash, + aux.series_init_idc_version AS series_init_idc_version, + aux.series_revised_idc_version AS series_revised_idc_version, + dcm.SOPInstanceUID AS SOPInstanceUID, + aux.instance_uuid as crdc_instance_uuid, + aux.gcs_url as gcs_url, + aux.instance_size as instance_size, + aux.instance_hash as instance_hash, + aux.instance_init_idc_version AS instance_init_idc_version, + aux.instance_revised_idc_version AS instance_revised_idc_version, + aux.source_doi as Source_DOI, + aux.license_url as license_url, + aux.license_long_name as license_long_name, + aux.license_short_name as license_short_name, + dcm.* except(PatientID, StudyInstanceUID, SeriesInstanceUID, SOPInstanceUID) + FROM + `PROJECT.DATASET.auxiliary_metadata` AS aux + INNER JOIN + `PROJECT.DATASET.dicom_metadata` AS dcm + ON + aux.SOPInstanceUID = dcm.SOPInstanceUID) + + SELECT + data_collections.Location AS tcia_tumorLocation, + data_collections.Species AS tcia_species, + data_collections.CancerType AS tcia_cancerType, + pre_dicom_all.* + FROM + pre_dicom_all + INNER JOIN + `PROJECT.DATASET.original_collections_metadata` AS data_collections + ON + pre_dicom_all.collection_id = data_collections.idc_webapp_collection_id diff --git a/datasets/idc/_images/generate_bq_views/queries/v5/dicom_metadata_curated.sql b/datasets/idc/_images/generate_bq_views/queries/v5/dicom_metadata_curated.sql new file mode 100644 index 000000000..bef536d74 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v5/dicom_metadata_curated.sql @@ -0,0 +1,5 @@ +SELECT +SOPInstanceUID, +SAFE_CAST(SliceThickness AS FLOAT64) AS SliceThickness +FROM +`PROJECT.DATASET.dicom_metadata` AS dcm diff --git a/datasets/idc/_images/generate_bq_views/queries/v5/measurement_groups.sql b/datasets/idc/_images/generate_bq_views/queries/v5/measurement_groups.sql new file mode 100644 index 000000000..df0a8f046 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v5/measurement_groups.sql @@ -0,0 +1,208 @@ +WITH + measurementGroups AS ( + WITH + contentSequenceLevel1 AS ( + WITH + structuredReports AS ( + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + ContentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + ( SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.11" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.22" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.33" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.34" + OR SOPClassUID = "1.2.840.10008.5.1.4.1.1.88.35" ) + AND ARRAY_LENGTH(ContentTemplateSequence) <> 0 + AND ContentTemplateSequence [ + OFFSET + (0)].TemplateIdentifier = "1500" + AND ContentTemplateSequence [ + OFFSET + (0)].MappingResource = "DCMR" ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence + FROM + structuredReports + CROSS JOIN + UNNEST(ContentSequence) AS contentSequence ) + SELECT + PatientID, + SOPInstanceUID, + SeriesDescription, + contentSequence, + measurementGroup_number + FROM + contentSequenceLevel1 + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WITH + OFFSET + AS measurementGroup_number + WHERE + contentSequence.ValueType = "CONTAINER" + AND contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeMeaning = "Measurement Group" ), + measurementGroups_withTrackingID AS ( + SELECT + SOPInstanceUID, + PatientID, + SeriesDescription, + measurementGroup_number, + unnestedContentSequence.TextValue AS trackingIdentifier, + measurementGroups.contentSequence + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "TEXT" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112039" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withTrackingUID AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS trackingUniqueIdentifier + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "112040" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withSegmentation AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPInstanceUID AS segmentationInstanceUID, + unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSegmentNumber AS segmentationSegmentNumber + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "IMAGE" + AND unnestedContentSequence.ReferencedSOPSequence[ + OFFSET + (0)].ReferencedSOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ), + measurementGroups_withSourceSeries AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.UID AS sourceSegmentedSeriesUID + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "UIDREF" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121232" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFinding AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS finding + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "121071" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "DCM" ) ), + measurementGroups_withFindingSite AS ( + SELECT + SOPInstanceUID, + measurementGroup_number, + unnestedContentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS findingSite + FROM + measurementGroups + CROSS JOIN + UNNEST(contentSequence.ContentSequence) AS unnestedContentSequence + WHERE + unnestedContentSequence.ValueType = "CODE" + AND ( unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodeValue = "G-C0E3" + AND unnestedContentSequence.ConceptNameCodeSequence [ + OFFSET + (0)].CodingSchemeDesignator = "SRT" ) ) +SELECT + mWithUID.SOPInstanceUID, + mWithUID.measurementGroup_number, + mWithUID.trackingUniqueIdentifier, + mWithID.trackingIdentifier, + mWithID.PatientID, + mWithID.SeriesDescription, + mWithFinding.finding, + mWithFindingSite.findingSite, + mWithSourceSeries.sourceSegmentedSeriesUID, + mWithSegmentation.segmentationInstanceUID, + mWithSegmentation.segmentationSegmentNumber, + mWithID.contentSequence +FROM + measurementGroups_withTrackingUID AS mWithUID +JOIN + measurementGroups_withTrackingID AS mWithID + --- +ON + mWithID.SOPInstanceUID = mWithUID.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithUID.measurementGroup_number +JOIN + measurementGroups_withFinding AS mWithFinding +ON + mWithID.SOPInstanceUID = mWithFinding.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFinding.measurementGroup_number +JOIN + measurementGroups_withFindingSite AS mWithFindingSite +ON + mWithID.SOPInstanceUID = mWithFindingSite.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithFindingSite.measurementGroup_number +JOIN + measurementGroups_withSourceSeries AS mWithSourceSeries +ON + mWithID.SOPInstanceUID = mWithSourceSeries.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSourceSeries.measurementGroup_number +JOIN + measurementGroups_withSegmentation AS mWithSegmentation +ON + mWithID.SOPInstanceUID = mWithSegmentation.SOPInstanceUID + AND mWithID.measurementGroup_number = mWithSegmentation.measurementGroup_number + --- +ORDER BY + trackingUniqueIdentifier diff --git a/datasets/idc/_images/generate_bq_views/queries/v5/qualitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v5/qualitative_measurements.sql new file mode 100644 index 000000000..d020e4639 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v5/qualitative_measurements.sql @@ -0,0 +1,99 @@ +WITH + contentSequenceLevel3 AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + findingsAndFindingSites AS ( + WITH + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + ConceptCodeSequence AS finding + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + ConceptCodeSequence AS findingSite + FROM + contentSequenceLevel3 + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ) + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier, + findingSites.findingSite, + findingSites.measurementGroup_number + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) +SELECT + contentSequenceLevel3.PatientID, + contentSequenceLevel3.SOPInstanceUID, + findingsAndFindingSites.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3.ConceptNameCodeSequence AS Quantity, + contentSequenceLevel3.ConceptCodeSequence AS Value, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite +FROM + contentSequenceLevel3 +JOIN + findingsAndFindingSites +ON + contentSequenceLevel3.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3.measurementGroup_number = findingsAndFindingSites.measurementGroup_number +WHERE + # exclude + ( ConceptNameCodeSequence.CodeMeaning <> "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "DCM" ) AND # Finding + ( ConceptNameCodeSequence.CodeMeaning <> "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator <> "SRT" ) # Finding Site + # correctness check: adding the below should result in a 36 rows column (4 segmented lesions, with 9 evaluations per each) + # AND + # contentSequenceLevel3.PatientID = "LIDC-IDRI-0001" diff --git a/datasets/idc/_images/generate_bq_views/queries/v5/quantitative_measurements.sql b/datasets/idc/_images/generate_bq_views/queries/v5/quantitative_measurements.sql new file mode 100644 index 000000000..4e71a043e --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v5/quantitative_measurements.sql @@ -0,0 +1,177 @@ +WITH + --- + contentSequenceLevel3numeric AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)] AS MeasuredValueSequence, + contentSequence.MeasuredValueSequence [ + SAFE_OFFSET + (0)].MeasurementUnitsCodeSequence [ + SAFE_OFFSET + (0)] AS MeasurementUnits, + contentSequence.ContentSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "NUM" ), + --- + contentSequenceLevel3codes AS ( + SELECT + PatientID, + SOPInstanceUID, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "CODE" ), + --- + contentSequenceLevel3uidrefs AS ( + SELECT + contentSequence.ConceptNameCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptNameCodeSequence, + contentSequence.ConceptCodeSequence [ + SAFE_OFFSET + (0)] AS ConceptCodeSequence, + measurementGroup_number + FROM + `PROJECT.DATASET.measurement_groups` + CROSS JOIN + UNNEST (contentSequence.ContentSequence) AS contentSequence + WHERE + contentSequence.ValueType = "UIDREF" + AND ConceptCodeSequence [ + SAFE_OFFSET + (0)].CodeMeaning = "Tracking Unique Identifier" ), + --- + findings AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS finding, + measurementGroup_number, + segmentationInstanceUID, + segmentationSegmentNumber, + sourceSegmentedSeriesUID, + trackingIdentifier, + trackingUniqueIdentifier, + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "121071" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "DCM" ), + --- + findingSites AS ( + SELECT + PatientID, + SOPInstanceUID, + ConceptCodeSequence AS findingSite, + measurementGroup_number + FROM + contentSequenceLevel3codes + WHERE + ConceptNameCodeSequence.CodeValue = "G-C0E3" + AND ConceptNameCodeSequence.CodingSchemeDesignator = "SRT" ), + --- + findingsAndFindingSites AS ( + SELECT + findings.PatientID, + findings.SOPInstanceUID, + findings.finding, + findingSites.findingSite, + findingSites.measurementGroup_number, + findings.segmentationInstanceUID, + findings.segmentationSegmentNumber, + findings.sourceSegmentedSeriesUID, + findings.trackingIdentifier, + findings.trackingUniqueIdentifier + FROM + findings + JOIN + findingSites + ON + findings.SOPInstanceUID = findingSites.SOPInstanceUID + AND findings.measurementGroup_number = findingSites.measurementGroup_number ) --- + # correctness check: the below should result in 11 rows (this is how many segments/measurement + # groups are there for each QIN-HEADNCK-01-0139 segmentation + #SELECT + # * + #FROM + # findingsAndFindingSites + #WHERE + # SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + --- + SELECT + contentSequenceLevel3numeric.PatientID, + contentSequenceLevel3numeric.SOPInstanceUID, + contentSequenceLevel3numeric.measurementGroup_number, + findingsAndFindingSites.segmentationInstanceUID, + findingsAndFindingSites.segmentationSegmentNumber, + findingsAndFindingSites.sourceSegmentedSeriesUID, + findingsAndFindingSites.trackingIdentifier, + findingsAndFindingSites.trackingUniqueIdentifier, + contentSequenceLevel3numeric.ConceptNameCodeSequence AS Quantity, + CASE ( ARRAY_LENGTH(contentSequenceLevel3numeric.ContentSequence) > 0 + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodeValue = "121401" + AND contentSequenceLevel3numeric.ContentSequence [ + SAFE_OFFSET + (0)].ConceptNameCodeSequence [ + SAFE_OFFSET + (0)].CodingSchemeDesignator = "DCM" ) + WHEN TRUE THEN STRUCT( contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeValue AS CodeValue, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodingSchemeDesignator AS CodingSchemeDesignator, contentSequenceLevel3numeric.ContentSequence [ SAFE_OFFSET (0)].ConceptCodeSequence [ SAFE_OFFSET (0)].CodeMeaning AS CodeMeaning ) + ELSE + STRUCT(NULL as CodeValue,NULL as CodingSchemeDesignator,NULL as CodeMeaning) + END + AS derivationModifier, + SAFE_CAST( contentSequenceLevel3numeric.MeasuredValueSequence.NumericValue [ + SAFE_OFFSET + (0)] AS NUMERIC ) AS Value, + contentSequenceLevel3numeric.MeasurementUnits AS Units, + findingsAndFindingSites.finding, + findingsAndFindingSites.findingSite + FROM + contentSequenceLevel3numeric + JOIN + findingsAndFindingSites + ON + contentSequenceLevel3numeric.SOPInstanceUID = findingsAndFindingSites.SOPInstanceUID + AND contentSequenceLevel3numeric.measurementGroup_number = findingsAndFindingSites.measurementGroup_number --- + # correctness check: for this patient, there should be 12 rows: 4 segmented nodules, with 3 numeric evaluations for each + #WHERE + # contentSequenceLevel3numeric.PatientID = "LIDC-IDRI-0001" + --- + # correctness check: for this specific instance, there should be 238 rows (11 segments) + #WHERE + # contentSequenceLevel3numeric.SOPInstanceUID = "1.2.276.0.7230010.3.1.4.8323329.18336.1440004659.731760" + #where contentSequenceLevel3numeric.PatientID LIKE "%QIN%" diff --git a/datasets/idc/_images/generate_bq_views/queries/v5/segmentations.sql b/datasets/idc/_images/generate_bq_views/queries/v5/segmentations.sql new file mode 100644 index 000000000..cbbfceb54 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/queries/v5/segmentations.sql @@ -0,0 +1,86 @@ +# TODO: +# * account for SCT codes in addition to SRT +# * add Anatomic Region Modifier +# Debug: +#WITH +# segs_details AS ( +WITH + segs AS ( + SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + SegmentSequence + FROM + `PROJECT.DATASET.dicom_metadata` + WHERE + # more reliable than Modality = "SEG" + SOPClassUID = "1.2.840.10008.5.1.4.1.1.66.4" ) +SELECT + PatientID, + SOPInstanceUID, + FrameOfReferenceUID, + CASE ARRAY_LENGTH(unnested.AnatomicRegionSequence) + WHEN 0 THEN NULL + ELSE + STRUCT( unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeValue AS CodeValue, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodingSchemeDesignator AS CodingSchemeDesignator, + unnested.AnatomicRegionSequence [ + OFFSET + (0)].CodeMeaning AS CodeMeaning ) +END + AS AnatomicRegion, + CASE ( ARRAY_LENGTH(unnested.AnatomicRegionSequence) > 0 + AND ARRAY_LENGTH( unnested.AnatomicRegionSequence [ + OFFSET + (0)].AnatomicRegionModifierSequence ) > 0 ) + WHEN TRUE THEN unnested.AnatomicRegionSequence [ OFFSET (0)].AnatomicRegionModifierSequence [ OFFSET (0)] #unnested.AnatomicRegionSequence[OFFSET(0)].AnatomicRegionModifierSequence, + ELSE + NULL +END + AS AnatomicRegionModifier, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyCategoryCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyCategoryCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyCategory, + CASE ARRAY_LENGTH(unnested.SegmentedPropertyTypeCodeSequence) + WHEN 0 THEN NULL + ELSE + unnested.SegmentedPropertyTypeCodeSequence [ +OFFSET + (0)] +END + AS SegmentedPropertyType, + #unnested.SegmentedPropertyTypeCodeSequence, + #unnested.SegmentedPropertyTypeModifierCodeSequence, + unnested.SegmentAlgorithmType, + unnested.SegmentNumber, + unnested.TrackingUID, + unnested.TrackingID +FROM + segs +CROSS JOIN + UNNEST(SegmentSequence) AS unnested # correctness check: there should be 4 segmented nodules for this subject + #where PatientID = "LIDC-IDRI-0001" + # Note that it is possible to have some of those sequences empty! + # Debug: + #WHERE + # ARRAY_LENGTH(unnested.AnatomicRegionSequence) = 0 + # Debug: + # ) +#SELECT +# DISTINCT SegmentedPropertyTypeCodeSequence[ +#OFFSET +# (0)].CodeMeaning +#FROM +# segs_details +#WHERE +# ARRAY_LENGTH(SegmentedPropertyTypeCodeSequence) <> 0 diff --git a/datasets/idc/_images/generate_bq_views/requirements.txt b/datasets/idc/_images/generate_bq_views/requirements.txt new file mode 100644 index 000000000..3700b272e --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/requirements.txt @@ -0,0 +1 @@ +google-cloud-bigquery diff --git a/datasets/idc/_images/generate_bq_views/script.py b/datasets/idc/_images/generate_bq_views/script.py new file mode 100644 index 000000000..89c6913b4 --- /dev/null +++ b/datasets/idc/_images/generate_bq_views/script.py @@ -0,0 +1,68 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import os +import pathlib +import typing + +from google.cloud import bigquery + + +def main( + queries_dir: pathlib.Path, + gcp_project: str, + dataset_name: str, + dataset_versions: typing.List[str], +): + client = bigquery.Client() + for version in dataset_versions: + sql_files = [f for f in (queries_dir / version).iterdir() if f.suffix == ".sql"] + for sql_file in sql_files: + query = load_query( + sql_file=sql_file, + gcp_project=gcp_project, + dataset=f"{dataset_name}_{version}", + ) + + client.query(query) + + +def load_query(sql_file: pathlib.Path, gcp_project: str, dataset: str) -> str: + query = sql_file.read_text() + + # Replace template variables + query = query.replace("PROJECT", gcp_project) + query = query.replace("DATASET", dataset) + query = f""" + CREATE OR REPLACE VIEW + `{gcp_project}.{dataset}.{sql_file.stem}` + AS ( + {query} + ) + """ + + return query + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + + main( + queries_dir=pathlib.Path(os.environ["QUERIES_DIR"]).expanduser(), + gcp_project=os.environ["GCP_PROJECT"], + dataset_name=os.environ["DATASET_NAME"], + dataset_versions=json.loads(os.environ["DATASET_VERSIONS"]), + ) diff --git a/datasets/idc/_terraform/idc_dataset.tf b/datasets/idc/_terraform/idc_dataset.tf new file mode 100644 index 000000000..6ad803476 --- /dev/null +++ b/datasets/idc/_terraform/idc_dataset.tf @@ -0,0 +1,77 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_storage_bucket" "idc" { + name = "${var.bucket_name_prefix}-idc" + force_destroy = true + location = "US" + uniform_bucket_level_access = true +} + +output "storage_bucket-idc-name" { + value = google_storage_bucket.idc.name +} + +resource "google_bigquery_dataset" "idc_v1" { + dataset_id = "idc_v1" + project = var.project_id + description = "Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v1 data" +} + +output "bigquery_dataset-idc_v1-dataset_id" { + value = google_bigquery_dataset.idc_v1.dataset_id +} + +resource "google_bigquery_dataset" "idc_v2" { + dataset_id = "idc_v2" + project = var.project_id + description = "Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v2 data" +} + +output "bigquery_dataset-idc_v2-dataset_id" { + value = google_bigquery_dataset.idc_v2.dataset_id +} + +resource "google_bigquery_dataset" "idc_v3" { + dataset_id = "idc_v3" + project = var.project_id + description = "Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v3 data" +} + +output "bigquery_dataset-idc_v3-dataset_id" { + value = google_bigquery_dataset.idc_v3.dataset_id +} + +resource "google_bigquery_dataset" "idc_v4" { + dataset_id = "idc_v4" + project = var.project_id + description = "Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v4 data" +} + +output "bigquery_dataset-idc_v4-dataset_id" { + value = google_bigquery_dataset.idc_v4.dataset_id +} + +resource "google_bigquery_dataset" "idc_v5" { + dataset_id = "idc_v5" + project = var.project_id + description = "Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v5 data" +} + +output "bigquery_dataset-idc_v5-dataset_id" { + value = google_bigquery_dataset.idc_v5.dataset_id +} diff --git a/datasets/idc/_terraform/provider.tf b/datasets/idc/_terraform/provider.tf new file mode 100644 index 000000000..23ab87dcd --- /dev/null +++ b/datasets/idc/_terraform/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/idc/_terraform/variables.tf b/datasets/idc/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/idc/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/idc/copy_tcia_data/copy_tcia_data_dag.py b/datasets/idc/copy_tcia_data/copy_tcia_data_dag.py new file mode 100644 index 000000000..ae003ee0f --- /dev/null +++ b/datasets/idc/copy_tcia_data/copy_tcia_data_dag.py @@ -0,0 +1,117 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.operators import cloud_storage_transfer_service + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-11-17", +} + + +with DAG( + dag_id="idc.copy_tcia_data", + default_args=default_args, + max_active_runs=1, + schedule_interval="@monthly", + catchup=False, + default_view="graph", +) as dag: + + # Task to run a GCS to GCS operation using Google resources + copy_gcs_bucket = ( + cloud_storage_transfer_service.CloudDataTransferServiceGCSToGCSOperator( + task_id="copy_gcs_bucket", + timeout=43200, + retries=0, + wait=True, + project_id="bigquery-public-data", + source_bucket="{{ var.json.idc.source_bucket }}", + destination_bucket="{{ var.json.idc.destination_bucket}}", + google_impersonation_chain="{{ var.json.idc.service_account }}", + ) + ) + + # Transfer IDC Databases + copy_bq_datasets = kubernetes_pod.KubernetesPodOperator( + task_id="copy_bq_datasets", + name="copy_bq_datasets", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.idc.container_registry.copy_bq_datasets }}", + env_vars={ + "SOURCE_PROJECT_ID": "{{ var.json.idc.source_project_id }}", + "TARGET_PROJECT_ID": "{{ var.json.idc.target_project_id }}", + "SERVICE_ACCOUNT": "{{ var.json.idc.service_account }}", + "DATASET_NAME": "idc", + "DATASET_VERSIONS": '["v1", "v2", "v3", "v4", "v5"]', + }, + resources={"limit_memory": "128M", "limit_cpu": "200m"}, + ) + + # Generate BQ views + generate_bq_views = kubernetes_pod.KubernetesPodOperator( + task_id="generate_bq_views", + name="generate_bq_views", + namespace="default", + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": ["pool-e2-standard-4"], + } + ] + } + ] + } + } + }, + image_pull_policy="Always", + image="{{ var.json.idc.container_registry.generate_bq_views }}", + env_vars={ + "QUERIES_DIR": "/custom/queries", + "GCP_PROJECT": "{{ var.value.gcp_project }}", + "DATASET_NAME": "idc", + "DATASET_VERSIONS": '["v1", "v2", "v3", "v4", "v5"]', + }, + resources={"limit_memory": "128M", "limit_cpu": "200m"}, + ) + + copy_gcs_bucket >> copy_bq_datasets >> generate_bq_views diff --git a/datasets/idc/copy_tcia_data/pipeline.yaml b/datasets/idc/copy_tcia_data/pipeline.yaml new file mode 100644 index 000000000..bd47e5694 --- /dev/null +++ b/datasets/idc/copy_tcia_data/pipeline.yaml @@ -0,0 +1,101 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +resources: ~ + +dag: + airflow_version: 2 + initialize: + dag_id: copy_tcia_data + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-11-17' + max_active_runs: 1 + schedule_interval: "@monthly" + catchup: False + default_view: graph + + tasks: + - operator: "CloudDataTransferServiceGCSToGCSOperator" + description: "Task to run a GCS to GCS operation using Google resources" + args: + task_id: copy_gcs_bucket + timeout: 43200 # 12 hours + retries: 0 + wait: True + project_id: bigquery-public-data + source_bucket: "{{ var.json.idc.source_bucket }}" + destination_bucket: "{{ var.json.idc.destination_bucket}}" + google_impersonation_chain: "{{ var.json.idc.service_account }}" + + - operator: "KubernetesPodOperator" + description: "Transfer IDC Databases" + args: + task_id: "copy_bq_datasets" + name: "copy_bq_datasets" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.idc.container_registry.copy_bq_datasets }}" + env_vars: + SOURCE_PROJECT_ID: "{{ var.json.idc.source_project_id }}" + TARGET_PROJECT_ID: "{{ var.json.idc.target_project_id }}" + SERVICE_ACCOUNT: "{{ var.json.idc.service_account }}" + DATASET_NAME: "idc" + DATASET_VERSIONS: >- + ["v1", "v2", "v3", "v4", "v5"] + resources: + limit_memory: "128M" + limit_cpu: "200m" + + - operator: "KubernetesPodOperator" + description: "Generate BQ views" + args: + task_id: "generate_bq_views" + name: "generate_bq_views" + namespace: "default" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-nodepool + operator: In + values: + - "pool-e2-standard-4" + image_pull_policy: "Always" + image: "{{ var.json.idc.container_registry.generate_bq_views }}" + env_vars: + QUERIES_DIR: "/custom/queries" + GCP_PROJECT: "{{ var.value.gcp_project }}" + DATASET_NAME: "idc" + DATASET_VERSIONS: >- + ["v1", "v2", "v3", "v4", "v5"] + resources: + limit_memory: "128M" + limit_cpu: "200m" + + graph_paths: + - "copy_gcs_bucket >> copy_bq_datasets >> generate_bq_views" diff --git a/datasets/idc/dataset.yaml b/datasets/idc/dataset.yaml new file mode 100644 index 000000000..40f5c96e0 --- /dev/null +++ b/datasets/idc/dataset.yaml @@ -0,0 +1,48 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dataset: + name: idc + friendly_name: ~ + description: ~ + dataset_sources: ~ + terms_of_use: ~ + + +resources: + + - type: storage_bucket + name: "idc" + uniform_bucket_level_access: True + location: US + + - type: bigquery_dataset + dataset_id: idc_v1 + description: Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v1 data + + - type: bigquery_dataset + dataset_id: idc_v2 + description: Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v2 data + + - type: bigquery_dataset + dataset_id: idc_v3 + description: Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v3 data + + - type: bigquery_dataset + dataset_id: idc_v4 + description: Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v4 data + + - type: bigquery_dataset + dataset_id: idc_v5 + description: Imaging Data Commons (IDC) - The Cancer Imaging Archive (TCIA) v5 data