diff --git a/.github/header-checker-lint.yml b/.github/header-checker-lint.yml index 0dcaff7d0df6..8e74f16e8cef 100644 --- a/.github/header-checker-lint.yml +++ b/.github/header-checker-lint.yml @@ -20,6 +20,9 @@ ignoreFiles: - "texttospeech/snippets/resources/hello.txt" - "language/**/resources/*.txt" - "language/snippets/classify_text/resources/texts/*.txt" + - "dlp/snippets/resources/accounts.txt" + - "dlp/snippets/resources/harmless.txt" + - "dlp/snippets/resources/test.txt" ignoreLicenseYear: true diff --git a/dlp/AUTHORING_GUIDE.md b/dlp/AUTHORING_GUIDE.md new file mode 100644 index 000000000000..55c97b32f4c1 --- /dev/null +++ b/dlp/AUTHORING_GUIDE.md @@ -0,0 +1 @@ +See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md \ No newline at end of file diff --git a/dlp/CONTRIBUTING.md b/dlp/CONTRIBUTING.md new file mode 100644 index 000000000000..34c882b6f1a3 --- /dev/null +++ b/dlp/CONTRIBUTING.md @@ -0,0 +1 @@ +See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/CONTRIBUTING.md \ No newline at end of file diff --git a/dlp/README.md b/dlp/README.md deleted file mode 100644 index df1718bc765b..000000000000 --- a/dlp/README.md +++ /dev/null @@ -1,3 +0,0 @@ -These samples have been moved. - -https://github.com/googleapis/python-dlp/tree/main/samples diff --git a/dlp/snippets/custom_infotype.py b/dlp/snippets/custom_infotype.py new file mode 100644 index 000000000000..4152b233ebe4 --- /dev/null +++ b/dlp/snippets/custom_infotype.py @@ -0,0 +1,873 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Custom infoType snippets. + +This file contains sample code that uses the Data Loss Prevention API to create +custom infoType detectors to refine scan results. +""" + + +# [START dlp_inspect_string_with_exclusion_dict] +def inspect_string_with_exclusion_dict( + project, content_string, exclusion_list=["example@example.com"] +): + """Inspects the provided text, avoiding matches specified in the exclusion list + + Uses the Data Loss Prevention API to omit matches on EMAIL_ADDRESS if they are + in the specified exclusion list. + + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + exclusion_list: The list of strings to ignore matches on + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a list of infoTypes for DLP to locate in `content_string`. See + # https://cloud.google.com/dlp/docs/concepts-infotypes for more information + # about supported infoTypes. + info_types_to_locate = [{"name": "EMAIL_ADDRESS"}] + + # Construct a rule set that will only match on EMAIL_ADDRESS + # if the match text is not in the exclusion list. 
+ rule_set = [ + { + "info_types": info_types_to_locate, + "rules": [ + { + "exclusion_rule": { + "dictionary": {"word_list": {"words": exclusion_list}}, + "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH, + } + } + ], + } + ] + + # Construct the configuration dictionary + inspect_config = { + "info_types": info_types_to_locate, + "rule_set": rule_set, + "include_quote": True, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + print(f"Quote: {finding.quote}") + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + + +# [END dlp_inspect_string_with_exclusion_dict] + + +# [START dlp_inspect_string_with_exclusion_regex] +def inspect_string_with_exclusion_regex( + project, content_string, exclusion_regex=".+@example.com" +): + """Inspects the provided text, avoiding matches specified in the exclusion regex + + Uses the Data Loss Prevention API to omit matches on EMAIL_ADDRESS if they match + the specified exclusion regex. + + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + exclusion_regex: The regular expression to exclude matches on + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a list of infoTypes for DLP to locate in `content_string`. See + # https://cloud.google.com/dlp/docs/concepts-infotypes for more information + # about supported infoTypes. + info_types_to_locate = [{"name": "EMAIL_ADDRESS"}] + + # Construct a rule set that will only match on EMAIL_ADDRESS + # if the specified regex doesn't also match. + rule_set = [ + { + "info_types": info_types_to_locate, + "rules": [ + { + "exclusion_rule": { + "regex": {"pattern": exclusion_regex}, + "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH, + } + } + ], + } + ] + + # Construct the configuration dictionary + inspect_config = { + "info_types": info_types_to_locate, + "rule_set": rule_set, + "include_quote": True, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + print(f"Quote: {finding.quote}") + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + + +# [END dlp_inspect_string_with_exclusion_regex] + + +# [START dlp_inspect_string_with_exclusion_dict_substring] +def inspect_string_with_exclusion_dict_substring( + project, content_string, exclusion_list=["TEST"] +): + """Inspects the provided text, avoiding matches that contain excluded tokens + + Uses the Data Loss Prevention API to omit matches if they include tokens + in the specified exclusion list. 
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+        exclusion_list: The list of strings to ignore partial matches on.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a list of infoTypes for DLP to locate in `content_string`. See
+    # https://cloud.google.com/dlp/docs/concepts-infotypes for more information
+    # about supported infoTypes.
+    info_types_to_locate = [{"name": "EMAIL_ADDRESS"}, {"name": "DOMAIN_NAME"}]
+
+    # Construct a rule set that will only match if the match text does not
+    # contain tokens from the exclusion list.
+    rule_set = [
+        {
+            "info_types": info_types_to_locate,
+            "rules": [
+                {
+                    "exclusion_rule": {
+                        "dictionary": {"word_list": {"words": exclusion_list}},
+                        "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
+                    }
+                }
+            ],
+        }
+    ]
+
+    # Construct the configuration dictionary
+    inspect_config = {
+        "info_types": info_types_to_locate,
+        "rule_set": rule_set,
+        "include_quote": True,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.inspect_content(
+        request={"parent": parent, "inspect_config": inspect_config, "item": item}
+    )
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_string_with_exclusion_dict_substring]
+
+
+# [START dlp_inspect_string_custom_excluding_substring]
+def inspect_string_custom_excluding_substring(
+    project, content_string, exclusion_list=["jimmy"]
+):
+    """Inspects the provided text with a custom detector, avoiding matches on specific tokens
+
+    Uses the Data Loss Prevention API to omit matches on a custom detector
+    if they include tokens in the specified exclusion list.
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+        exclusion_list: The list of strings to ignore matches on.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a custom regex detector for names
+    custom_info_types = [
+        {
+            "info_type": {"name": "CUSTOM_NAME_DETECTOR"},
+            "regex": {"pattern": "[A-Z][a-z]{1,15}, [A-Z][a-z]{1,15}"},
+        }
+    ]
+
+    # Construct a rule set that will only match if the match text does not
+    # contain tokens from the exclusion list.
+    rule_set = [
+        {
+            "info_types": [{"name": "CUSTOM_NAME_DETECTOR"}],
+            "rules": [
+                {
+                    "exclusion_rule": {
+                        "dictionary": {"word_list": {"words": exclusion_list}},
+                        "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
+                    }
+                }
+            ],
+        }
+    ]
+
+    # Construct the configuration dictionary
+    inspect_config = {
+        "custom_info_types": custom_info_types,
+        "rule_set": rule_set,
+        "include_quote": True,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.inspect_content(
+        request={"parent": parent, "inspect_config": inspect_config, "item": item}
+    )
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_string_custom_excluding_substring]
+
+
+# [START dlp_inspect_string_custom_omit_overlap]
+def inspect_string_custom_omit_overlap(project, content_string):
+    """Matches PERSON_NAME, but omits matches that overlap with a custom detector
+
+    Uses the Data Loss Prevention API to omit matches on a built-in detector
+    if they overlap with matches from a custom detector
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a custom regex detector for names
+    custom_info_types = [
+        {
+            "info_type": {"name": "VIP_DETECTOR"},
+            "regex": {"pattern": "Larry Page|Sergey Brin"},
+            "exclusion_type": google.cloud.dlp_v2.CustomInfoType.ExclusionType.EXCLUSION_TYPE_EXCLUDE,
+        }
+    ]
+
+    # Construct a rule set that will exclude PERSON_NAME matches
+    # that overlap with VIP_DETECTOR matches
+    rule_set = [
+        {
+            "info_types": [{"name": "PERSON_NAME"}],
+            "rules": [
+                {
+                    "exclusion_rule": {
+                        "exclude_info_types": {
+                            "info_types": [{"name": "VIP_DETECTOR"}]
+                        },
+                        "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_FULL_MATCH,
+                    }
+                }
+            ],
+        }
+    ]
+
+    # Construct the configuration dictionary
+    inspect_config = {
+        "info_types": [{"name": "PERSON_NAME"}],
+        "custom_info_types": custom_info_types,
+        "rule_set": rule_set,
+        "include_quote": True,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.inspect_content(
+        request={"parent": parent, "inspect_config": inspect_config, "item": item}
+    )
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_string_custom_omit_overlap]
+
+
+# [START dlp_omit_name_if_also_email]
+def omit_name_if_also_email(
+    project,
+    content_string,
+):
+    """Matches PERSON_NAME and EMAIL_ADDRESS, but not both.
+
+    Uses the Data Loss Prevention API to omit matches on PERSON_NAME if the
+    EMAIL_ADDRESS detector also matches.
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a list of infoTypes for DLP to locate in `content_string`. See
+    # https://cloud.google.com/dlp/docs/concepts-infotypes for more information
+    # about supported infoTypes.
+ info_types_to_locate = [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}] + + # Construct the configuration dictionary that will only match on PERSON_NAME + # if the EMAIL_ADDRESS doesn't also match. This configuration helps reduce + # the total number of findings when there is a large overlap between different + # infoTypes. + inspect_config = { + "info_types": info_types_to_locate, + "rule_set": [ + { + "info_types": [{"name": "PERSON_NAME"}], + "rules": [ + { + "exclusion_rule": { + "exclude_info_types": { + "info_types": [{"name": "EMAIL_ADDRESS"}] + }, + "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, + } + } + ], + } + ], + "include_quote": True, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + print(f"Quote: {finding.quote}") + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + + +# [END dlp_omit_name_if_also_email] + + +# [START dlp_inspect_string_without_overlap] +def inspect_string_without_overlap(project, content_string): + """Matches EMAIL_ADDRESS and DOMAIN_NAME, but DOMAIN_NAME is omitted + if it overlaps with EMAIL_ADDRESS + + Uses the Data Loss Prevention API to omit matches of one infotype + that overlap with another. + + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a list of infoTypes for DLP to locate in `content_string`. See + # https://cloud.google.com/dlp/docs/concepts-infotypes for more information + # about supported infoTypes. + info_types_to_locate = [{"name": "DOMAIN_NAME"}, {"name": "EMAIL_ADDRESS"}] + + # Define a custom info type to exclude email addresses + custom_info_types = [ + { + "info_type": {"name": "EMAIL_ADDRESS"}, + "exclusion_type": google.cloud.dlp_v2.CustomInfoType.ExclusionType.EXCLUSION_TYPE_EXCLUDE, + } + ] + + # Construct a rule set that will exclude DOMAIN_NAME matches + # that overlap with EMAIL_ADDRESS matches + rule_set = [ + { + "info_types": [{"name": "DOMAIN_NAME"}], + "rules": [ + { + "exclusion_rule": { + "exclude_info_types": { + "info_types": [{"name": "EMAIL_ADDRESS"}] + }, + "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH, + } + } + ], + } + ] + + # Construct the configuration dictionary + inspect_config = { + "info_types": info_types_to_locate, + "custom_info_types": custom_info_types, + "rule_set": rule_set, + "include_quote": True, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. 
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_string_without_overlap]
+
+
+# [START dlp_inspect_with_person_name_w_custom_hotword]
+def inspect_with_person_name_w_custom_hotword(
+    project, content_string, custom_hotword="patient"
+):
+    """Uses the Data Loss Prevention API to increase the likelihood of matches
+    on PERSON_NAME if the user-specified custom hotword is present. Only
+    includes findings with the increased likelihood by setting a minimum
+    likelihood threshold of VERY_LIKELY.
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+        custom_hotword: The custom hotword used for likelihood boosting.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a rule set with the caller-provided hotword, with a likelihood
+    # boost to VERY_LIKELY when the hotword is present within the 50-character
+    # window preceding the PII finding.
+    hotword_rule = {
+        "hotword_regex": {"pattern": custom_hotword},
+        "likelihood_adjustment": {
+            "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY
+        },
+        "proximity": {"window_before": 50},
+    }
+
+    rule_set = [
+        {
+            "info_types": [{"name": "PERSON_NAME"}],
+            "rules": [{"hotword_rule": hotword_rule}],
+        }
+    ]
+
+    # Construct the configuration dictionary with the rule set and the
+    # minimum likelihood threshold.
+    inspect_config = {
+        "rule_set": rule_set,
+        "min_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY,
+        "include_quote": True,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.inspect_content(
+        request={"parent": parent, "inspect_config": inspect_config, "item": item}
+    )
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_with_person_name_w_custom_hotword]
+
+
+# [START dlp_inspect_string_multiple_rules]
+def inspect_string_multiple_rules(project, content_string):
+    """Uses the Data Loss Prevention API to modify the likelihood of matches
+    on PERSON_NAME by combining multiple hotword and exclusion rules.
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct hotword rules
+    patient_rule = {
+        "hotword_regex": {"pattern": "patient"},
+        "proximity": {"window_before": 10},
+        "likelihood_adjustment": {
+            "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY
+        },
+    }
+    doctor_rule = {
+        "hotword_regex": {"pattern": "doctor"},
+        "proximity": {"window_before": 10},
+        "likelihood_adjustment": {
+            "fixed_likelihood": google.cloud.dlp_v2.Likelihood.UNLIKELY
+        },
+    }
+
+    # Construct exclusion rules
+    quasimodo_rule = {
+        "dictionary": {"word_list": {"words": ["quasimodo"]}},
+        "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
+    }
+    redacted_rule = {
+        "regex": {"pattern": "REDACTED"},
+        "matching_type": google.cloud.dlp_v2.MatchingType.MATCHING_TYPE_PARTIAL_MATCH,
+    }
+
+    # Construct the rule set, combining the above rules
+    rule_set = [
+        {
+            "info_types": [{"name": "PERSON_NAME"}],
+            "rules": [
+                {"hotword_rule": patient_rule},
+                {"hotword_rule": doctor_rule},
+                {"exclusion_rule": quasimodo_rule},
+                {"exclusion_rule": redacted_rule},
+            ],
+        }
+    ]
+
+    # Construct the configuration dictionary
+    inspect_config = {
+        "info_types": [{"name": "PERSON_NAME"}],
+        "rule_set": rule_set,
+        "include_quote": True,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.inspect_content(
+        request={"parent": parent, "inspect_config": inspect_config, "item": item}
+    )
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_string_multiple_rules]
+
+
+# [START dlp_inspect_with_medical_record_number_custom_regex_detector]
+def inspect_with_medical_record_number_custom_regex_detector(
+    project,
+    content_string,
+):
+    """Uses the Data Loss Prevention API to analyze a string with a medical
+    record number custom regex detector
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a custom regex detector info type called "C_MRN",
+    # with ###-#-##### pattern, where each # represents a digit from 1 to 9.
+    # The detector has a detection likelihood of POSSIBLE.
+    custom_info_types = [
+        {
+            "info_type": {"name": "C_MRN"},
+            "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
+            "likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE,
+        }
+    ]
+
+    # Construct the configuration dictionary with the custom regex info type.
+    inspect_config = {
+        "custom_info_types": custom_info_types,
+        "include_quote": True,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.inspect_content(
+        request={"parent": parent, "inspect_config": inspect_config, "item": item}
+    )
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_with_medical_record_number_custom_regex_detector]
+
+
+# [START dlp_inspect_with_medical_record_number_w_custom_hotwords]
+def inspect_with_medical_record_number_w_custom_hotwords(
+    project,
+    content_string,
+):
+    """Uses the Data Loss Prevention API to analyze a string with a medical
+    record number custom regex detector, with custom hotword rules to boost
+    finding certainty under some circumstances.
+
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a custom regex detector info type called "C_MRN",
+    # with ###-#-##### pattern, where each # represents a digit from 1 to 9.
+    # The detector has a detection likelihood of POSSIBLE.
+    custom_info_types = [
+        {
+            "info_type": {"name": "C_MRN"},
+            "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
+            "likelihood": google.cloud.dlp_v2.Likelihood.POSSIBLE,
+        }
+    ]
+
+    # Construct a rule set with hotwords "mrn" and "medical", with a likelihood
+    # boost to VERY_LIKELY when the hotwords are present within the 10-character
+    # window preceding the PII finding.
+    hotword_rule = {
+        "hotword_regex": {"pattern": "(?i)(mrn|medical)(?-i)"},
+        "likelihood_adjustment": {
+            "fixed_likelihood": google.cloud.dlp_v2.Likelihood.VERY_LIKELY
+        },
+        "proximity": {"window_before": 10},
+    }
+
+    rule_set = [
+        {"info_types": [{"name": "C_MRN"}], "rules": [{"hotword_rule": hotword_rule}]}
+    ]
+
+    # Construct the configuration dictionary with the custom regex info type.
+    inspect_config = {
+        "custom_info_types": custom_info_types,
+        "rule_set": rule_set,
+        "include_quote": True,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.inspect_content(
+        request={"parent": parent, "inspect_config": inspect_config, "item": item}
+    )
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            print(f"Quote: {finding.quote}")
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+
+# [END dlp_inspect_with_medical_record_number_w_custom_hotwords]
diff --git a/dlp/snippets/custom_infotype_test.py b/dlp/snippets/custom_infotype_test.py
new file mode 100644
index 000000000000..13c5e3275427
--- /dev/null
+++ b/dlp/snippets/custom_infotype_test.py
@@ -0,0 +1,162 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
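+
+# Note: these tests make live calls to the DLP API, so they assume the
+# GOOGLE_CLOUD_PROJECT environment variable names a project with the DLP API
+# enabled and that application default credentials are available.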
+ +import os + +import custom_infotype + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") + + +def test_inspect_string_with_exclusion_dict(capsys): + custom_infotype.inspect_string_with_exclusion_dict( + GCLOUD_PROJECT, "gary@example.com, example@example.com", ["example@example.com"] + ) + + out, _ = capsys.readouterr() + assert "example@example.com" not in out + assert "gary@example.com" in out + + +def test_inspect_string_with_exclusion_regex(capsys): + custom_infotype.inspect_string_with_exclusion_regex( + GCLOUD_PROJECT, "alice@example.com, ironman@avengers.net", ".+@example.com" + ) + + out, _ = capsys.readouterr() + assert "alice" not in out + assert "ironman" in out + + +def test_inspect_string_with_exclusion_dict_substring(capsys): + custom_infotype.inspect_string_with_exclusion_dict_substring( + GCLOUD_PROJECT, "bob@example.com TEST@example.com TEST.com", ["TEST"] + ) + + out, _ = capsys.readouterr() + assert "TEST@example.com" not in out + assert "TEST.com" not in out + assert "bob@example.com" in out + + +def test_inspect_string_custom_excluding_substring(capsys): + custom_infotype.inspect_string_custom_excluding_substring( + GCLOUD_PROJECT, "Danger, Jimmy | Wayne, Bruce", ["Jimmy"] + ) + + out, _ = capsys.readouterr() + assert "Wayne, Bruce" in out + assert "Danger, Jimmy" not in out + + +def test_inspect_string_custom_omit_overlap(capsys): + custom_infotype.inspect_string_custom_omit_overlap( + GCLOUD_PROJECT, "Larry Page and John Doe" + ) + + out, _ = capsys.readouterr() + assert "Larry Page" not in out + assert "John Doe" in out + + +def test_omit_name_if_also_email(capsys): + custom_infotype.omit_name_if_also_email(GCLOUD_PROJECT, "alice@example.com") + + # Ensure we found only EMAIL_ADDRESS, and not PERSON_NAME. + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + assert "Info type: PERSON_NAME" not in out + + +def test_inspect_string_without_overlap(capsys): + custom_infotype.inspect_string_without_overlap( + GCLOUD_PROJECT, "example.com is a domain, james@example.org is an email." 
+    )
+
+    out, _ = capsys.readouterr()
+    assert "example.com" in out
+    assert "example.org" not in out
+
+
+def test_inspect_with_person_name_w_custom_hotword(capsys):
+    custom_infotype.inspect_with_person_name_w_custom_hotword(
+        GCLOUD_PROJECT, "patient's name is John Doe.", "patient"
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Info type: PERSON_NAME" in out
+    assert "Likelihood: 5" in out
+
+
+def test_inspect_string_multiple_rules_patient(capsys):
+    custom_infotype.inspect_string_multiple_rules(
+        GCLOUD_PROJECT, "patient name: Jane Doe"
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Likelihood: 4" in out
+
+
+def test_inspect_string_multiple_rules_doctor(capsys):
+    custom_infotype.inspect_string_multiple_rules(GCLOUD_PROJECT, "doctor: Jane Doe")
+
+    out, _ = capsys.readouterr()
+    assert "No findings" in out
+
+
+def test_inspect_string_multiple_rules_quasimodo(capsys):
+    custom_infotype.inspect_string_multiple_rules(
+        GCLOUD_PROJECT, "patient name: quasimodo"
+    )
+
+    out, _ = capsys.readouterr()
+    assert "No findings" in out
+
+
+def test_inspect_string_multiple_rules_redacted(capsys):
+    custom_infotype.inspect_string_multiple_rules(
+        GCLOUD_PROJECT, "name of patient: REDACTED"
+    )
+
+    out, _ = capsys.readouterr()
+    assert "No findings" in out
+
+
+def test_inspect_with_medical_record_number_custom_regex_detector(capsys):
+    custom_infotype.inspect_with_medical_record_number_custom_regex_detector(
+        GCLOUD_PROJECT, "Patients MRN 444-5-22222"
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Info type: C_MRN" in out
+
+
+def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords(capsys):
+    custom_infotype.inspect_with_medical_record_number_w_custom_hotwords(
+        GCLOUD_PROJECT, "just a number 444-5-22222"
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Info type: C_MRN" in out
+    assert "Likelihood: 3" in out
+
+
+def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords(capsys):
+    custom_infotype.inspect_with_medical_record_number_w_custom_hotwords(
+        GCLOUD_PROJECT, "Patients MRN 444-5-22222"
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Info type: C_MRN" in out
+    assert "Likelihood: 5" in out
diff --git a/dlp/snippets/deid.py b/dlp/snippets/deid.py
new file mode 100644
index 000000000000..3e6968ff786b
--- /dev/null
+++ b/dlp/snippets/deid.py
@@ -0,0 +1,1228 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Uses of the Data Loss Prevention API for deidentifying sensitive data."""
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_deidentify_masking]
+def deidentify_with_mask(
+    project, input_str, info_types, masking_character=None, number_to_mask=0
+):
+    """Uses the Data Loss Prevention API to deidentify sensitive data in a
+    string by masking it with a character.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to deidentify (will be treated as text).
+        info_types: A list of strings representing info types to look for.
+        masking_character: The character to mask matching sensitive data with.
+ number_to_mask: The maximum number of sensitive characters to mask in + a match. If omitted or set to zero, the API will default to no + maximum. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Construct inspect configuration dictionary + inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "character_mask_config": { + "masking_character": masking_character, + "number_to_mask": number_to_mask, + } + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + request={ + "parent": parent, + "deidentify_config": deidentify_config, + "inspect_config": inspect_config, + "item": item, + } + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_masking] + +# [START dlp_deidentify_redact] +def deidentify_with_redact( + project, + input_str, + info_types, +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by redacting matched input values. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Construct inspect configuration dictionary + inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [{"primitive_transformation": {"redact_config": {}}}] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + request={ + "parent": parent, + "deidentify_config": deidentify_config, + "inspect_config": inspect_config, + "item": item, + } + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_redact] + +# [START dlp_deidentify_replace] +def deidentify_with_replace( + project, + input_str, + info_types, + replacement_str="REPLACEMENT_STR", +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing matched input values with a value you specify. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + replacement_str: The string to replace all values that match given + info types. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. 
+ parent = f"projects/{project}" + + # Construct inspect configuration dictionary + inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "replace_config": { + "new_value": {"string_value": replacement_str} + } + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + request={ + "parent": parent, + "deidentify_config": deidentify_config, + "inspect_config": inspect_config, + "item": item, + } + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_replace] + +# [START dlp_deidentify_fpe] + + +def deidentify_with_fpe( + project, + input_str, + info_types, + alphabet=None, + surrogate_type=None, + key_name=None, + wrapped_key=None, +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string using Format Preserving Encryption (FPE). + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + surrogate_type: The name of the surrogate custom info type to use. Only + necessary if you want to reverse the deidentification process. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
+ import base64 + + wrapped_key = base64.b64decode(wrapped_key) + + # Construct FPE configuration dictionary + crypto_replace_ffx_fpe_config = { + "crypto_key": { + "kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name} + }, + "common_alphabet": alphabet, + } + + # Add surrogate type + if surrogate_type: + crypto_replace_ffx_fpe_config["surrogate_info_type"] = {"name": surrogate_type} + + # Construct inspect configuration dictionary + inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]} + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config + } + } + ] + } + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + request={ + "parent": parent, + "deidentify_config": deidentify_config, + "inspect_config": inspect_config, + "item": item, + } + ) + + # Print results + print(response.item.value) + + +# [END dlp_deidentify_fpe] + +# [START dlp_deidentify_deterministic] +def deidentify_with_deterministic( + project, + input_str, + info_types, + surrogate_type=None, + key_name=None, + wrapped_key=None, +): + """Deidentifies sensitive data in a string using deterministic encryption. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + surrogate_type: The name of the surrogate custom info type to use. Only + necessary if you want to reverse the deidentification process. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + import base64 + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
+    wrapped_key = base64.b64decode(wrapped_key)
+
+    # Construct Deterministic encryption configuration dictionary
+    crypto_replace_deterministic_config = {
+        "crypto_key": {
+            "kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name}
+        },
+    }
+
+    # Add surrogate type
+    if surrogate_type:
+        crypto_replace_deterministic_config["surrogate_info_type"] = {
+            "name": surrogate_type
+        }
+
+    # Construct inspect configuration dictionary
+    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}
+
+    # Construct deidentify configuration dictionary
+    deidentify_config = {
+        "info_type_transformations": {
+            "transformations": [
+                {
+                    "primitive_transformation": {
+                        "crypto_deterministic_config": crypto_replace_deterministic_config
+                    }
+                }
+            ]
+        }
+    }
+
+    # Convert string to item
+    item = {"value": input_str}
+
+    # Call the API
+    response = dlp.deidentify_content(
+        request={
+            "parent": parent,
+            "deidentify_config": deidentify_config,
+            "inspect_config": inspect_config,
+            "item": item,
+        }
+    )
+
+    # Print results
+    print(response.item.value)
+
+
+# [END dlp_deidentify_deterministic]
+
+
+# [START dlp_reidentify_fpe]
+def reidentify_with_fpe(
+    project,
+    input_str,
+    alphabet=None,
+    surrogate_type=None,
+    key_name=None,
+    wrapped_key=None,
+):
+    """Uses the Data Loss Prevention API to reidentify sensitive data in a
+    string that was encrypted by Format Preserving Encryption (FPE).
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to reidentify (will be treated as text).
+        alphabet: The set of characters to replace sensitive ones with. For
+            more information, see https://cloud.google.com/dlp/docs/reference/
+            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+        surrogate_type: The name of the surrogate custom info type used
+            during the encryption process.
+        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
+            AES-256 key. Example:
+            keyName = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
+            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
+        wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
+            should be encrypted using the Cloud KMS key specified by key_name.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # The wrapped key is base64-encoded, but the library expects a binary
+    # string, so decode it here.
+ import base64 + + wrapped_key = base64.b64decode(wrapped_key) + + # Construct Deidentify Config + reidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": { + "kms_wrapped": { + "wrapped_key": wrapped_key, + "crypto_key_name": key_name, + } + }, + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + ] + } + } + + inspect_config = { + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} + ] + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.reidentify_content( + request={ + "parent": parent, + "reidentify_config": reidentify_config, + "inspect_config": inspect_config, + "item": item, + } + ) + + # Print results + print(response.item.value) + + +# [END dlp_reidentify_fpe] + + +# [START dlp_reidentify_deterministic] +def reidentify_with_deterministic( + project, + input_str, + surrogate_type=None, + key_name=None, + wrapped_key=None, +): + """Re-identifies content that was previously de-identified through deterministic encryption. + Args: + project: The Google Cloud project ID to use as a parent resource. + input_str: The string to be re-identified. Provide the entire token. Example: + EMAIL_ADDRESS_TOKEN(52):AVAx2eIEnIQP5jbNEr2j9wLOAd5m4kpSBR/0jjjGdAOmryzZbE/q + surrogate_type: The name of the surrogate custom infoType used + during the encryption process. + key_name: The name of the Cloud KMS key used to encrypt ("wrap") the + AES-256 key. Example: + keyName = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ("wrapped") AES-256 key previously used to encrypt the content. + This key must have been encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + import base64 + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
+ wrapped_key = base64.b64decode(wrapped_key) + + # Construct reidentify Configuration + reidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "crypto_deterministic_config": { + "crypto_key": { + "kms_wrapped": { + "wrapped_key": wrapped_key, + "crypto_key_name": key_name, + } + }, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + ] + } + } + + inspect_config = { + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} + ] + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.reidentify_content( + request={ + "parent": parent, + "reidentify_config": reidentify_config, + "inspect_config": inspect_config, + "item": item, + } + ) + + # Print results + print(response.item.value) + + +# [END dlp_reidentify_deterministic] + + +# [START dlp_deidentify_free_text_with_fpe_using_surrogate] +def deidentify_free_text_with_fpe_using_surrogate( + project, + input_str, + alphabet="NUMERIC", + info_type="PHONE_NUMBER", + surrogate_type="PHONE_TOKEN", + unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==", +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string using Format Preserving Encryption (FPE). + The encryption is performed with an unwrapped key. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + info_type: The name of the info type to de-identify + surrogate_type: The name of the surrogate custom info type to use. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + unwrapped_key: The base64-encoded AES-256 key to use. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # The unwrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
+    import base64
+
+    unwrapped_key = base64.b64decode(unwrapped_key)
+
+    # Construct de-identify config
+    transformation = {
+        "info_types": [{"name": info_type}],
+        "primitive_transformation": {
+            "crypto_replace_ffx_fpe_config": {
+                "crypto_key": {"unwrapped": {"key": unwrapped_key}},
+                "common_alphabet": alphabet,
+                "surrogate_info_type": {"name": surrogate_type},
+            }
+        },
+    }
+
+    deidentify_config = {
+        "info_type_transformations": {"transformations": [transformation]}
+    }
+
+    # Construct the inspect config, trying to find all PII with likelihood
+    # higher than UNLIKELY
+    inspect_config = {
+        "info_types": [{"name": info_type}],
+        "min_likelihood": google.cloud.dlp_v2.Likelihood.UNLIKELY,
+    }
+
+    # Convert string to item
+    item = {"value": input_str}
+
+    # Call the API
+    response = dlp.deidentify_content(
+        request={
+            "parent": parent,
+            "deidentify_config": deidentify_config,
+            "inspect_config": inspect_config,
+            "item": item,
+        }
+    )
+
+    # Print results
+    print(response.item.value)
+
+
+# [END dlp_deidentify_free_text_with_fpe_using_surrogate]
+
+
+# [START dlp_reidentify_free_text_with_fpe_using_surrogate]
+def reidentify_free_text_with_fpe_using_surrogate(
+    project,
+    input_str,
+    alphabet="NUMERIC",
+    surrogate_type="PHONE_TOKEN",
+    unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
+):
+    """Uses the Data Loss Prevention API to reidentify sensitive data in a
+    string that was encrypted by Format Preserving Encryption (FPE) with a
+    surrogate type. The encryption is performed with an unwrapped key.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to reidentify (will be treated as text).
+        alphabet: The set of characters to replace sensitive ones with. For
+            more information, see https://cloud.google.com/dlp/docs/reference/
+            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+        surrogate_type: The name of the surrogate custom info type used
+            during the encryption process.
+        unwrapped_key: The base64-encoded AES-256 key to use.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # The unwrapped key is base64-encoded, but the library expects a binary
+    # string, so decode it here.
+ import base64 + + unwrapped_key = base64.b64decode(unwrapped_key) + + # Construct Deidentify Config + transformation = { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": {"unwrapped": {"key": unwrapped_key}}, + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + + reidentify_config = { + "info_type_transformations": {"transformations": [transformation]} + } + + inspect_config = { + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} + ] + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.reidentify_content( + request={ + "parent": parent, + "reidentify_config": reidentify_config, + "inspect_config": inspect_config, + "item": item, + } + ) + + # Print results + print(response.item.value) + + +# [END dlp_reidentify_free_text_with_fpe_using_surrogate] + + +# [START dlp_deidentify_date_shift] +def deidentify_with_date_shift( + project, + input_csv_file=None, + output_csv_file=None, + date_fields=None, + lower_bound_days=None, + upper_bound_days=None, + context_field_id=None, + wrapped_key=None, + key_name=None, +): + """Uses the Data Loss Prevention API to deidentify dates in a CSV file by + pseudorandomly shifting them. + Args: + project: The Google Cloud project id to use as a parent resource. + input_csv_file: The path to the CSV file to deidentify. The first row + of the file must specify column names, and all other rows must + contain valid values. + output_csv_file: The path to save the date-shifted CSV file. + date_fields: The list of (date) fields in the CSV file to date shift. + Example: ['birth_date', 'register_date'] + lower_bound_days: The maximum number of days to shift a date backward + upper_bound_days: The maximum number of days to shift a date forward + context_field_id: (Optional) The column to determine date shift amount + based on. If this is not specified, a random shift amount will be + used for every row. If this is specified, then 'wrappedKey' and + 'keyName' must also be set. Example: + contextFieldId = [{ 'name': 'user_id' }] + key_name: (Optional) The name of the Cloud KMS key used to encrypt + ('wrap') the AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use. + This key should be encrypted using the Cloud KMS key specified by + key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. 
+ parent = f"projects/{project}" + + # Convert date field list to Protobuf type + def map_fields(field): + return {"name": field} + + if date_fields: + date_fields = map(map_fields, date_fields) + else: + date_fields = [] + + # Read and parse the CSV file + import csv + from datetime import datetime + + f = [] + with open(input_csv_file, "r") as csvfile: + reader = csv.reader(csvfile) + for row in reader: + f.append(row) + + # Helper function for converting CSV rows to Protobuf types + def map_headers(header): + return {"name": header} + + def map_data(value): + try: + date = datetime.strptime(value, "%m/%d/%Y") + return { + "date_value": {"year": date.year, "month": date.month, "day": date.day} + } + except ValueError: + return {"string_value": value} + + def map_rows(row): + return {"values": map(map_data, row)} + + # Using the helper functions, convert CSV rows to protobuf-compatible + # dictionaries. + csv_headers = map(map_headers, f[0]) + csv_rows = map(map_rows, f[1:]) + + # Construct the table dict + table_item = {"table": {"headers": csv_headers, "rows": csv_rows}} + + # Construct date shift config + date_shift_config = { + "lower_bound_days": lower_bound_days, + "upper_bound_days": upper_bound_days, + } + + # If using a Cloud KMS key, add it to the date_shift_config. + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. + if context_field_id and key_name and wrapped_key: + import base64 + + date_shift_config["context"] = {"name": context_field_id} + date_shift_config["crypto_key"] = { + "kms_wrapped": { + "wrapped_key": base64.b64decode(wrapped_key), + "crypto_key_name": key_name, + } + } + elif context_field_id or key_name or wrapped_key: + raise ValueError( + """You must set either ALL or NONE of + [context_field_id, key_name, wrapped_key]!""" + ) + + # Construct Deidentify Config + deidentify_config = { + "record_transformations": { + "field_transformations": [ + { + "fields": date_fields, + "primitive_transformation": { + "date_shift_config": date_shift_config + }, + } + ] + } + } + + # Write to CSV helper methods + def write_header(header): + return header.name + + def write_data(data): + return data.string_value or "%s/%s/%s" % ( + data.date_value.month, + data.date_value.day, + data.date_value.year, + ) + + # Call the API + response = dlp.deidentify_content( + request={ + "parent": parent, + "deidentify_config": deidentify_config, + "item": table_item, + } + ) + + # Write results to CSV file + with open(output_csv_file, "w") as csvfile: + write_file = csv.writer(csvfile, delimiter=",") + write_file.writerow(map(write_header, response.item.table.headers)) + for row in response.item.table.rows: + write_file.writerow(map(write_data, row.values)) + # Print status + print("Successfully saved date-shift output to {}".format(output_csv_file)) + + +# [END dlp_deidentify_date_shift] + + +# [START dlp_deidentify_replace_infotype] +def deidentify_with_replace_infotype(project, item, info_types): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing it with the info type. + Args: + project: The Google Cloud project id to use as a parent resource. + item: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + Returns: + None; the response from the API is printed to the terminal. 
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Construct inspect configuration dictionary
+    inspect_config = {"info_types": [{"name": info_type} for info_type in info_types]}
+
+    # Construct deidentify configuration dictionary
+    deidentify_config = {
+        "info_type_transformations": {
+            "transformations": [
+                {"primitive_transformation": {"replace_with_info_type_config": {}}}
+            ]
+        }
+    }
+
+    # Call the API
+    response = dlp.deidentify_content(
+        request={
+            "parent": parent,
+            "deidentify_config": deidentify_config,
+            "inspect_config": inspect_config,
+            "item": {"value": item},
+        }
+    )
+
+    # Print out the results.
+    print(response.item.value)
+
+
+# [END dlp_deidentify_replace_infotype]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        dest="content", help="Select how to submit content to the API."
+    )
+    subparsers.required = True
+
+    mask_parser = subparsers.add_parser(
+        "deid_mask",
+        help="Deidentify sensitive data in a string by masking it with a " "character.",
+    )
+    mask_parser.add_argument(
+        "--info_types",
+        nargs="+",
+        help="Strings representing info types to look for. A full list of "
+        "info categories and types is available from the API. Examples "
+        'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+        "If unspecified, the three above examples will be used.",
+        default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+    )
+    mask_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    mask_parser.add_argument("item", help="The string to deidentify.")
+    mask_parser.add_argument(
+        "-n",
+        "--number_to_mask",
+        type=int,
+        default=0,
+        help="The maximum number of sensitive characters to mask in a match. "
+        "If omitted from the request or set to 0, the API will mask all "
+        "matching characters.",
+    )
+    mask_parser.add_argument(
+        "-m",
+        "--masking_character",
+        help="The character to mask matching sensitive data with.",
+    )
+
+    replace_parser = subparsers.add_parser(
+        "deid_replace",
+        help="Deidentify sensitive data in a string by replacing it with "
+        "another string.",
+    )
+    replace_parser.add_argument(
+        "--info_types",
+        nargs="+",
+        help="Strings representing info types to look for. A full list of "
+        "info categories and types is available from the API. Examples "
+        'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+        "If unspecified, the three above examples will be used.",
+        default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+    )
+    replace_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    replace_parser.add_argument("item", help="The string to deidentify.")
+    replace_parser.add_argument(
+        "replacement_str", help="The string to " "replace all matched values with."
+    )
+
+    fpe_parser = subparsers.add_parser(
+        "deid_fpe",
+        help="Deidentify sensitive data in a string using Format Preserving "
+        "Encryption (FPE).",
+    )
+    fpe_parser.add_argument(
+        "--info_types",
+        action="append",
+        help="Strings representing info types to look for. A full list of "
+        "info categories and types is available from the API. Examples "
+        'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+        "If unspecified, the three above examples will be used.",
+        default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+    )
+    fpe_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    fpe_parser.add_argument(
+        "item",
+        help="The string to deidentify. " "Example: string = 'My SSN is 372819127'",
+    )
+    fpe_parser.add_argument(
+        "key_name",
+        help="The name of the Cloud KMS key used to encrypt ('wrap') the "
+        "AES-256 key. Example: "
+        "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+        "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+    )
+    fpe_parser.add_argument(
+        "wrapped_key",
+        help="The encrypted ('wrapped') AES-256 key to use. This key should "
+        "be encrypted using the Cloud KMS key specified by key_name.",
+    )
+    fpe_parser.add_argument(
+        "-a",
+        "--alphabet",
+        default="ALPHA_NUMERIC",
+        help="The set of characters to replace sensitive ones with. Commonly "
+        'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
+        '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
+        '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
+    )
+    fpe_parser.add_argument(
+        "-s",
+        "--surrogate_type",
+        help="The name of the surrogate custom info type to use. Only "
+        "necessary if you want to reverse the deidentification process. Can "
+        "be essentially any arbitrary string, as long as it doesn't appear "
+        "in your dataset otherwise.",
+    )
+
+    reid_parser = subparsers.add_parser(
+        "reid_fpe",
+        help="Reidentify sensitive data in a string using Format Preserving "
+        "Encryption (FPE).",
+    )
+    reid_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    reid_parser.add_argument(
+        "item",
+        help="The string to reidentify. "
+        "Example: string = 'My SSN is SSN_TOKEN(9):731997681'",
+    )
+    reid_parser.add_argument(
+        "surrogate_type",
+        help="The name of the surrogate custom info type that was used "
+        "during the deidentification process. Can be essentially any "
+        "arbitrary string, as long as it doesn't appear in your dataset "
+        "otherwise.",
+    )
+    reid_parser.add_argument(
+        "key_name",
+        help="The name of the Cloud KMS key used to encrypt ('wrap') the "
+        "AES-256 key. Example: "
+        "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+        "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+    )
+    reid_parser.add_argument(
+        "wrapped_key",
+        help="The encrypted ('wrapped') AES-256 key to use. This key should "
+        "be encrypted using the Cloud KMS key specified by key_name.",
+    )
+    reid_parser.add_argument(
+        "-a",
+        "--alphabet",
+        default="ALPHA_NUMERIC",
+        help="The set of characters to replace sensitive ones with. Commonly "
+        'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
+        '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
+        '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
+    )
+
+    date_shift_parser = subparsers.add_parser(
+        "deid_date_shift",
+        help="Deidentify dates in a CSV file by pseudorandomly shifting them.",
+    )
+    date_shift_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    date_shift_parser.add_argument(
+        "input_csv_file",
+        help="The path to the CSV file to deidentify. The first row of the "
+        "file must specify column names, and all other rows must contain "
+        "valid values.",
+    )
+    date_shift_parser.add_argument(
+        "output_csv_file", help="The path to save the date-shifted CSV file."
+    )
+    date_shift_parser.add_argument(
+        "lower_bound_days",
+        type=int,
+        help="The maximum number of days to shift a date backward",
+    )
+    date_shift_parser.add_argument(
+        "upper_bound_days",
+        type=int,
+        help="The maximum number of days to shift a date forward",
+    )
+    date_shift_parser.add_argument(
+        "date_fields",
+        nargs="+",
+        help="The list of date fields in the CSV file to date shift. Example: "
+        "['birth_date', 'register_date']",
+    )
+    date_shift_parser.add_argument(
+        "--context_field_id",
+        help="(Optional) The column whose values determine the date shift "
+        "amount for each row. If this is not specified, a random shift "
+        "amount will be used for every row. If this is specified, then "
+        "'--wrapped_key' and '--key_name' must also be set.",
+    )
+    date_shift_parser.add_argument(
+        "--key_name",
+        help="(Optional) The name of the Cloud KMS key used to encrypt "
+        "('wrap') the AES-256 key. Example: "
+        "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+        "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+    )
+    date_shift_parser.add_argument(
+        "--wrapped_key",
+        help="(Optional) The encrypted ('wrapped') AES-256 key to use. This "
+        "key should be encrypted using the Cloud KMS key specified by "
+        "key_name.",
+    )
+
+    replace_with_infotype_parser = subparsers.add_parser(
+        "replace_with_infotype",
+        help="Deidentify sensitive data in a string by replacing it with the "
+        "info type of the data.",
+    )
+    replace_with_infotype_parser.add_argument(
+        "--info_types",
+        action="append",
+        help="Strings representing info types to look for. A full list of "
+        "info categories and types is available from the API. Examples "
+        'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+        "If unspecified, the three above examples will be used.",
+        default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+    )
+    replace_with_infotype_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    replace_with_infotype_parser.add_argument(
+        "item",
+        help="The string to deidentify. "
+ "Example: 'My credit card is 4242 4242 4242 4242'", + ) + + args = parser.parse_args() + + if args.content == "deid_mask": + deidentify_with_mask( + args.project, + args.item, + args.info_types, + masking_character=args.masking_character, + number_to_mask=args.number_to_mask, + ) + elif args.content == "deid_replace": + deidentify_with_replace( + args.project, + args.item, + args.info_types, + replacement_str=args.replacement_str, + ) + elif args.content == "deid_fpe": + deidentify_with_fpe( + args.project, + args.item, + args.info_types, + alphabet=args.alphabet, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + surrogate_type=args.surrogate_type, + ) + elif args.content == "reid_fpe": + reidentify_with_fpe( + args.project, + args.item, + surrogate_type=args.surrogate_type, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + alphabet=args.alphabet, + ) + elif args.content == "deid_date_shift": + deidentify_with_date_shift( + args.project, + input_csv_file=args.input_csv_file, + output_csv_file=args.output_csv_file, + lower_bound_days=args.lower_bound_days, + upper_bound_days=args.upper_bound_days, + date_fields=args.date_fields, + context_field_id=args.context_field_id, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + ) + elif args.content == "replace_with_infotype": + deidentify_with_replace_infotype( + args.project, + item=args.item, + info_types=args.info_types, + ) diff --git a/dlp/snippets/deid_test.py b/dlp/snippets/deid_test.py new file mode 100644 index 000000000000..d6df2e6bae4a --- /dev/null +++ b/dlp/snippets/deid_test.py @@ -0,0 +1,291 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
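+
+# Note on prerequisites: these tests read the project id from the
+# GOOGLE_CLOUD_PROJECT environment variable, and the KMS-backed cases assume
+# a key ring and key both named "dlp-test" in the project's "global" location
+# (see KEY_NAME below). The wrapped/unwrapped key constants are test material
+# for that assumed setup, not values to reuse elsewhere.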
+ +import os +import shutil +import tempfile + +import google.cloud.dlp_v2 +import pytest + +import deid + +HARMFUL_STRING = "My SSN is 372819127" +HARMLESS_STRING = "My favorite color is blue" +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +UNWRAPPED_KEY = "YWJjZGVmZ2hpamtsbW5vcA==" +WRAPPED_KEY = ( + "CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy" + "uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL" + "rotx7Chxz/4z7SIpXFOBY61z0/U=" +) +KEY_NAME = ( + f"projects/{GCLOUD_PROJECT}/locations/global/keyRings/" + "dlp-test/cryptoKeys/dlp-test" +) +SURROGATE_TYPE = "SSN_TOKEN" +CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv") +DATE_SHIFTED_AMOUNT = 30 +DATE_FIELDS = ["birth_date", "register_date"] +CSV_CONTEXT_FIELD = "name" + + +@pytest.fixture(scope="module") +def tempdir(): + tempdir = tempfile.mkdtemp() + yield tempdir + shutil.rmtree(tempdir) + + +def test_deidentify_with_mask(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) + + out, _ = capsys.readouterr() + assert "My SSN is *********" in out + + +def test_deidentify_with_mask_ignore_insensitive_data(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMLESS_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) + + out, _ = capsys.readouterr() + assert HARMLESS_STRING in out + + +def test_deidentify_with_mask_masking_character_specified(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + masking_character="#", + ) + + out, _ = capsys.readouterr() + assert "My SSN is #########" in out + + +def test_deidentify_with_mask_masking_number_specified(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + number_to_mask=7, + ) + + out, _ = capsys.readouterr() + assert "My SSN is *******27" in out + + +def test_deidentify_with_redact(capsys): + deid.deidentify_with_redact( + GCLOUD_PROJECT, HARMFUL_STRING + "!", ["US_SOCIAL_SECURITY_NUMBER"] + ) + out, _ = capsys.readouterr() + assert "My SSN is !" 
in out + + +def test_deidentify_with_replace(capsys): + deid.deidentify_with_replace( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + replacement_str="REPLACEMENT_STR", + ) + + out, _ = capsys.readouterr() + assert "My SSN is REPLACEMENT_STR" in out + + +def test_deidentify_with_fpe(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert "My SSN is" in out + assert "372819127" not in out + + +def test_deidentify_with_deterministic(capsys): + deid.deidentify_with_deterministic( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + surrogate_type=SURROGATE_TYPE, + key_name=KEY_NAME, + wrapped_key=WRAPPED_KEY, + ) + + out, _ = capsys.readouterr() + assert "My SSN is" in out + assert "372819127" not in out + + +def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + surrogate_type=SURROGATE_TYPE, + ) + + out, _ = capsys.readouterr() + assert "My SSN is SSN_TOKEN" in out + assert "372819127" not in out + + +def test_deidentify_with_fpe_ignores_insensitive_data(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMLESS_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert HARMLESS_STRING in out + + +def test_deidentify_with_date_shift(tempdir, capsys): + output_filepath = os.path.join(tempdir, "dates-shifted.csv") + + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + ) + + out, _ = capsys.readouterr() + + assert "Successful" in out + + +def test_deidentify_with_date_shift_using_context_field(tempdir, capsys): + output_filepath = os.path.join(tempdir, "dates-shifted.csv") + + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + context_field_id=CSV_CONTEXT_FIELD, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + + assert "Successful" in out + + +def test_reidentify_with_fpe(capsys): + labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681" + + deid.reidentify_with_fpe( + GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type=SURROGATE_TYPE, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, + ) + + out, _ = capsys.readouterr() + + assert "731997681" not in out + + +def test_reidentify_with_deterministic(capsys): + labeled_fpe_string = "My SSN is SSN_TOKEN(36):ATeRUd3WWnAHHFtjtl1bv+CT09FZ7hyqNas=" + + deid.reidentify_with_deterministic( + GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type=SURROGATE_TYPE, + key_name=KEY_NAME, + wrapped_key=WRAPPED_KEY, + ) + + out, _ = capsys.readouterr() + + assert "SSN_TOKEN(" not in out + + +def test_deidentify_free_text_with_fpe_using_surrogate(capsys): + labeled_fpe_string = "My 
phone number is 4359916732" + + deid.deidentify_free_text_with_fpe_using_surrogate( + GCLOUD_PROJECT, + labeled_fpe_string, + info_type="PHONE_NUMBER", + surrogate_type="PHONE_TOKEN", + unwrapped_key=UNWRAPPED_KEY, + alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, + ) + + out, _ = capsys.readouterr() + + assert "PHONE_TOKEN" in out + assert "My phone number is" in out + assert "4359916732" not in out + + +def test_reidentify_free_text_with_fpe_using_surrogate(capsys): + labeled_fpe_string = "My phone number is PHONE_TOKEN(10):9617256398" + + deid.reidentify_free_text_with_fpe_using_surrogate( + GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type="PHONE_TOKEN", + unwrapped_key=UNWRAPPED_KEY, + alphabet=google.cloud.dlp_v2.CharsToIgnore.CommonCharsToIgnore.NUMERIC, + ) + + out, _ = capsys.readouterr() + + assert "PHONE_TOKEN" not in out + assert "9617256398" not in out + assert "My phone number is" in out + + +def test_deidentify_with_replace_infotype(capsys): + url_to_redact = "https://cloud.google.com" + deid.deidentify_with_replace_infotype( + GCLOUD_PROJECT, + "My favorite site is " + url_to_redact, + ["URL"], + ) + + out, _ = capsys.readouterr() + + assert url_to_redact not in out + assert "My favorite site is [URL]" in out diff --git a/dlp/snippets/inspect_content.py b/dlp/snippets/inspect_content.py new file mode 100644 index 000000000000..b8a7d5599fe9 --- /dev/null +++ b/dlp/snippets/inspect_content.py @@ -0,0 +1,1438 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API to inspect a string, a +local file or a file on Google Cloud Storage.""" + +from __future__ import print_function + +import argparse +import json +import os + + +# [START dlp_inspect_string_basic] +def inspect_string_basic( + project, + content_string, + info_types=["PHONE_NUMBER"], +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. + inspect_config = { + "info_types": info_types, + "include_quote": True, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. 
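+    # (content.inspect is a synchronous request and subject to request size
+    # limits; for large files or whole buckets, prefer the job-based samples
+    # later in this file, such as inspect_gcs_file.)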
+ response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + print("Quote: {}".format(finding.quote)) + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_string_basic] + + +# [START dlp_inspect_string] +def inspect_string( + project, + content_string, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. 
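+    # Each finding carries the detected info type and a likelihood bucket;
+    # the matched text (quote) is only populated when include_quote is True.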
+ if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_string] + +# [START dlp_inspect_table] + + +def inspect_table( + project, + data, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + data: Json string representing table data. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + Example: + data = { + "header":[ + "email", + "phone number" + ], + "rows":[ + [ + "robertfrost@xyz.com", + "4232342345" + ], + [ + "johndoe@pqr.com", + "4253458383" + ] + ] + } + + >> $ python inspect_content.py table \ + '{"header": ["email", "phone number"], + "rows": [["robertfrost@xyz.com", "4232342345"], + ["johndoe@pqr.com", "4253458383"]]}' + >> Quote: robertfrost@xyz.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + Quote: johndoe@pqr.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct the `table`. 
For more details on the table schema, please see + # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table + headers = [{"name": val} for val in data["header"]] + rows = [] + for row in data["rows"]: + rows.append({"values": [{"string_value": cell_val} for cell_val in row]}) + + table = {} + table["headers"] = headers + table["rows"] = rows + item = {"table": table} + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_table] + +# [START dlp_inspect_file] + + +def inspect_file( + project, + filename, + info_types, + min_likelihood=None, + custom_dictionaries=None, + custom_regexes=None, + max_findings=None, + include_quote=True, + mime_type=None, +): + """Uses the Data Loss Prevention API to analyze a file for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + mime_type: The MIME type of the file. If not specified, the type is + inferred via the Python standard library's mimetypes module. + Returns: + None; the response from the API is printed to the terminal. + """ + + import mimetypes + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. 
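+    # (If min_likelihood is left as None, the service applies its default
+    # threshold, which is POSSIBLE at the time of writing.)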
+ inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # If mime_type is not specified, guess it from the filename. + if mime_type is None: + mime_guess = mimetypes.MimeTypes().guess_type(filename) + mime_type = mime_guess[0] + + # Select the content type index from the list of supported types. + supported_content_types = { + None: 0, # "Unspecified" + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, + } + content_type_index = supported_content_types.get(mime_type, 0) + + # Construct the item, containing the file's byte data. + with open(filename, mode="rb") as f: + item = {"byte_item": {"type_": content_type_index, "data": f.read()}} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.inspect_content( + request={"parent": parent, "inspect_config": inspect_config, "item": item} + ) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_file] + + +# [START dlp_inspect_gcs] +def inspect_gcs_file( + project, + bucket, + filename, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): + """Uses the Data Loss Prevention API to analyze a file on GCS. + Args: + project: The Google Cloud project id to use as a parent resource. + bucket: The name of the GCS bucket containing the file, as a string. + filename: The name of the file in the bucket, including the path, as a + string; e.g. 'images/myfile.png'. + topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). 
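+    # For example, ["FIRST_NAME", "EMAIL_ADDRESS"] becomes
+    # [{"name": "FIRST_NAME"}, {"name": "EMAIL_ADDRESS"}].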
+ if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a storage_config containing the file's URL. + url = "gs://{}/{}".format(bucket, filename) + storage_config = {"cloud_storage_options": {"file_set": {"url": url}}} + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = f"projects/{project}/locations/global" + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job( + request={"parent": parent, "inspect_job": inspect_job} + ) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(request={"name": operation.name}) + print(f"Job name: {job.name}") + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + subscriber.subscribe(subscription_path, callback=callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." 
+ ) + + +# [END dlp_inspect_gcs] + + +# [START dlp_inspect_datastore] +def inspect_datastore( + project, + datastore_project, + kind, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + namespace_id=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): + """Uses the Data Loss Prevention API to analyze Datastore data. + Args: + project: The Google Cloud project id to use as a parent resource. + datastore_project: The Google Cloud project id of the target Datastore. + kind: The kind of the Datastore entity to inspect, e.g. 'Person'. + topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + namespace_id: The namespace of the Datastore document, if applicable. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a storage_config containing the target Datastore info. + storage_config = { + "datastore_options": { + "partition_id": { + "project_id": datastore_project, + "namespace_id": namespace_id, + }, + "kind": {"name": kind}, + } + } + + # Convert the project id into full resource ids. 
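+    # (topic_path renders "projects/{project}/topics/{topic_id}"; the parent
+    # below is location-scoped, as the job-based APIs expect.)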
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = f"projects/{project}/locations/global"
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Construct the inspect_job, which defines the entire inspect content task.
+    inspect_job = {
+        "inspect_config": inspect_config,
+        "storage_config": storage_config,
+        "actions": actions,
+    }
+
+    operation = dlp.create_dlp_job(
+        request={"parent": parent, "inspect_job": inspect_job}
+    )
+    print("Inspection operation started: {}".format(operation.name))
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+
+    # Set up a callback to acknowledge a message. This closes around an event
+    # so that it can signal that it is done and the main thread can continue.
+    job_done = threading.Event()
+
+    def callback(message):
+        try:
+            if message.attributes["DlpJobName"] == operation.name:
+                # This is the message we're looking for, so acknowledge it.
+                message.ack()
+
+                # Now that the job is done, fetch the results and print them.
+                job = dlp.get_dlp_job(request={"name": operation.name})
+                print(f"Job name: {job.name}")
+                if job.inspect_details.result.info_type_stats:
+                    for finding in job.inspect_details.result.info_type_stats:
+                        print(
+                            "Info type: {}; Count: {}".format(
+                                finding.info_type.name, finding.count
+                            )
+                        )
+                else:
+                    print("No findings.")
+
+                # Signal to the main thread that we can exit.
+                job_done.set()
+            else:
+                # This is not the message we're looking for.
+                message.drop()
+        except Exception as e:
+            # Because this is executing in a thread, an exception won't be
+            # noted unless we print it manually.
+            print(e)
+            raise
+
+    # Register the callback and wait on the event.
+    subscriber.subscribe(subscription_path, callback=callback)
+
+    finished = job_done.wait(timeout=timeout)
+    if not finished:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+
+
+# [END dlp_inspect_datastore]
+
+
+# [START dlp_inspect_bigquery]
+def inspect_bigquery(
+    project,
+    bigquery_project,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    info_types,
+    custom_dictionaries=None,
+    custom_regexes=None,
+    min_likelihood=None,
+    max_findings=None,
+    timeout=500,
+):
+    """Uses the Data Loss Prevention API to analyze BigQuery data.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        bigquery_project: The Google Cloud project id of the target table.
+        dataset_id: The id of the target BigQuery dataset.
+        table_id: The id of the target BigQuery table.
+        topic_id: The id of the Cloud Pub/Sub topic to which the API will
+            broadcast job completion. The topic must already exist.
+        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+            while waiting for job completion. The subscription must already
+            exist and be subscribed to the topic.
+        info_types: A list of strings representing info types to look for.
+            A full list of info type categories can be fetched from the API.
+        min_likelihood: A string representing the minimum likelihood threshold
+            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a storage_config containing the target Bigquery info. + storage_config = { + "big_query_options": { + "table_reference": { + "project_id": bigquery_project, + "dataset_id": dataset_id, + "table_id": table_id, + } + } + } + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = f"projects/{project}/locations/global" + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job( + request={"parent": parent, "inspect_job": inspect_job} + ) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. 
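+                # (Storage inspection jobs report aggregate counts per info
+                # type via info_type_stats rather than individual quotes.)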
+ job = dlp.get_dlp_job(request={"name": operation.name}) + print(f"Job name: {job.name}") + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. + subscriber.subscribe(subscription_path, callback=callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + + +# [END dlp_inspect_bigquery] + + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + parser_string = subparsers.add_parser("string", help="Inspect a string.") + parser_string.add_argument("item", help="The string to inspect.") + parser_string.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_string.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_string.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_string.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_string.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_string.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_string.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_table = subparsers.add_parser("table", help="Inspect a table.") + parser_table.add_argument( + "data", help="Json string representing a table.", type=json.loads + ) + parser_table.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_table.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. 
Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_table.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_table.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_table.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_table.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_table.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_file = subparsers.add_parser("file", help="Inspect a local file.") + parser_file.add_argument("filename", help="The path to the file to inspect.") + parser_file.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_file.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_file.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_file.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_file.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_file.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_file.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + parser_file.add_argument( + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) + + parser_gcs = subparsers.add_parser( + "gcs", help="Inspect files on Google Cloud Storage." + ) + parser_gcs.add_argument( + "bucket", help="The name of the GCS bucket containing the file." + ) + parser_gcs.add_argument( + "filename", + help="The name of the file in the bucket, including the path, e.g. " + '"images/myfile.png". 
Wildcards are permitted.', + ) + parser_gcs.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_gcs.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_gcs.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_gcs.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_gcs.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_gcs.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_gcs.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_gcs.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_gcs.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) + + parser_datastore = subparsers.add_parser( + "datastore", help="Inspect files on Google Datastore." + ) + parser_datastore.add_argument( + "datastore_project", + help="The Google Cloud project id of the target Datastore.", + ) + parser_datastore.add_argument( + "kind", + help='The kind of the Datastore entity to inspect, e.g. "Person".', + ) + parser_datastore.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_datastore.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_datastore.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_datastore.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_datastore.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_datastore.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_datastore.add_argument( + "--namespace_id", help="The Datastore namespace to use, if applicable." + ) + parser_datastore.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_datastore.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_datastore.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) + + parser_bigquery = subparsers.add_parser( + "bigquery", help="Inspect files on Google BigQuery." + ) + parser_bigquery.add_argument( + "bigquery_project", + help="The Google Cloud project id of the target table.", + ) + parser_bigquery.add_argument( + "dataset_id", help="The ID of the target BigQuery dataset." + ) + parser_bigquery.add_argument( + "table_id", help="The ID of the target BigQuery table." + ) + parser_bigquery.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_bigquery.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_bigquery.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_bigquery.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_bigquery.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. 
Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_bigquery.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_bigquery.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_bigquery.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_bigquery.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) + + args = parser.parse_args() + + if args.content == "string": + inspect_string( + args.project, + args.item, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.content == "table": + inspect_table( + args.project, + args.data, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.content == "file": + inspect_file( + args.project, + args.filename, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + mime_type=args.mime_type, + ) + elif args.content == "gcs": + inspect_gcs_file( + args.project, + args.bucket, + args.filename, + args.topic_id, + args.subscription_id, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout, + ) + elif args.content == "datastore": + inspect_datastore( + args.project, + args.datastore_project, + args.kind, + args.topic_id, + args.subscription_id, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + namespace_id=args.namespace_id, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout, + ) + elif args.content == "bigquery": + inspect_bigquery( + args.project, + args.bigquery_project, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout, + ) diff --git a/dlp/snippets/inspect_content_test.py b/dlp/snippets/inspect_content_test.py new file mode 100644 index 000000000000..5697439e8b1a --- /dev/null +++ b/dlp/snippets/inspect_content_test.py @@ -0,0 +1,504 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import uuid
+
+import backoff
+import google.api_core.exceptions
+from google.api_core.exceptions import ServiceUnavailable
+import google.cloud.bigquery
+import google.cloud.datastore
+import google.cloud.dlp_v2
+import google.cloud.exceptions
+import google.cloud.pubsub
+import google.cloud.storage
+import pytest
+
+import inspect_content
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TOPIC_ID = "dlp-test" + UNIQUE_STRING
+SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING
+DATASTORE_KIND = "DLP test kind"
+DATASTORE_NAME = "DLP test object" + UNIQUE_STRING
+BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING
+BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING
+
+TIMEOUT = 900 # 15 minutes
+
+DLP_CLIENT = google.cloud.dlp_v2.DlpServiceClient()
+
+
+@pytest.fixture(scope="module")
+def bucket():
+ # Creates a GCS bucket, uploads files required for the test, and tears down
+ # the entire bucket afterwards.
+
+ client = google.cloud.storage.Client()
+ try:
+ bucket = client.get_bucket(TEST_BUCKET_NAME)
+ except google.cloud.exceptions.NotFound:
+ bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+ # Upload the blobs and keep track of them in a list.
+ blobs = []
+ for name in RESOURCE_FILE_NAMES:
+ path = os.path.join(RESOURCE_DIRECTORY, name)
+ blob = bucket.blob(name)
+ blob.upload_from_filename(path)
+ blobs.append(blob)
+
+ # Yield the object to the test; lines after this execute as a teardown.
+ yield bucket
+
+ # Delete the files.
+ for blob in blobs:
+ try:
+ blob.delete()
+ except google.cloud.exceptions.NotFound:
+ print("Issue during teardown, missing blob")
+
+ # Attempt to delete the bucket; this will only work if it is empty.
+ bucket.delete()
+
+
+@pytest.fixture(scope="module")
+def topic_id():
+ # Creates a Pub/Sub topic and tears it down.
+ publisher = google.cloud.pubsub.PublisherClient()
+ topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID)
+ try:
+ publisher.create_topic(request={"name": topic_path})
+ except google.api_core.exceptions.AlreadyExists:
+ pass
+
+ yield TOPIC_ID
+
+ publisher.delete_topic(request={"topic": topic_path})
+
+
+@pytest.fixture(scope="module")
+def subscription_id(topic_id):
+ # Subscribes to a topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id)
+ subscription_path = subscriber.subscription_path(GCLOUD_PROJECT, SUBSCRIPTION_ID)
+ try:
+ subscriber.create_subscription(
+ request={"name": subscription_path, "topic": topic_path}
+ )
+ except google.api_core.exceptions.AlreadyExists:
+ pass
+
+ yield SUBSCRIPTION_ID
+
+ subscriber.delete_subscription(request={"subscription": subscription_path})
+
+
+@pytest.fixture(scope="module")
+def datastore_project():
+ # Adds test Datastore data, yields the project ID and then tears down.
+ datastore_client = google.cloud.datastore.Client()
+
+ kind = DATASTORE_KIND
+ name = DATASTORE_NAME
+ key = datastore_client.key(kind, name)
+ item = google.cloud.datastore.Entity(key=key)
+ item["payload"] = "My name is Gary Smith and my email is gary@example.com"
+
+ datastore_client.put(item)
+
+ yield GCLOUD_PROJECT
+
+ @backoff.on_exception(backoff.expo, ServiceUnavailable, max_time=120)
+ def cleanup():
+ datastore_client.delete(key)
+
+ cleanup()
+
+
+@pytest.fixture(scope="module")
+def bigquery_project():
+ # Adds test BigQuery data, yields the project ID and then tears down.
+ bigquery_client = google.cloud.bigquery.Client()
+
+ dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID)
+ dataset = google.cloud.bigquery.Dataset(dataset_ref)
+ try:
+ dataset = bigquery_client.create_dataset(dataset)
+ except google.api_core.exceptions.Conflict:
+ dataset = bigquery_client.get_dataset(dataset)
+
+ table_ref = dataset_ref.table(BIGQUERY_TABLE_ID)
+ table = google.cloud.bigquery.Table(table_ref)
+
+ # A minimal two-column schema is sufficient for these tests.
+ table.schema = (
+ google.cloud.bigquery.SchemaField("Name", "STRING"),
+ google.cloud.bigquery.SchemaField("Comment", "STRING"),
+ )
+
+ try:
+ table = bigquery_client.create_table(table)
+ time.sleep(30)
+ except google.api_core.exceptions.Conflict:
+ table = bigquery_client.get_table(table)
+
+ rows_to_insert = [("Gary Smith", "My email is gary@example.com")]
+
+ bigquery_client.insert_rows(table, rows_to_insert)
+
+ yield GCLOUD_PROJECT
+
+ @backoff.on_exception(backoff.expo, ServiceUnavailable, max_time=120)
+ def cleanup():
+ bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
+
+ cleanup()
+
+
+def test_inspect_string_basic(capsys):
+ test_string = "String with a phone number: 234-555-6789"
+
+ inspect_content.inspect_string_basic(GCLOUD_PROJECT, test_string)
+
+ out, _ = capsys.readouterr()
+ assert "Info type: PHONE_NUMBER" in out
+ assert "Quote: 234-555-6789" in out
+
+
+def test_inspect_string(capsys):
+ test_string = "My name is Gary Smith and my email is gary@example.com"
+
+ inspect_content.inspect_string(
+ GCLOUD_PROJECT,
+ test_string,
+ ["FIRST_NAME", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: FIRST_NAME" in out
+ assert "Info type: EMAIL_ADDRESS" in out
+
+
+def test_inspect_table(capsys):
+ test_tabular_data = {
+ "header": ["email", "phone number"],
+ "rows": [
+ ["robertfrost@xyz.com", "4232342345"],
+ ["johndoe@pqr.com", "4253458383"],
+ ],
+ }
+
+ inspect_content.inspect_table(
+ GCLOUD_PROJECT,
+ test_tabular_data,
+ ["PHONE_NUMBER", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: PHONE_NUMBER" in out
+ assert "Info type: EMAIL_ADDRESS" in out
+
+
+def test_inspect_string_with_custom_info_types(capsys):
+ test_string = "My name is Gary Smith and my email is gary@example.com"
+ dictionaries = ["Gary Smith"]
+ regexes = ["\\w+@\\w+.com"]
+
+ inspect_content.inspect_string(
+ GCLOUD_PROJECT,
+ test_string,
+ [],
+ custom_dictionaries=dictionaries,
+ custom_regexes=regexes,
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: CUSTOM_DICTIONARY_0" in out
+ assert "Info type: CUSTOM_REGEX_0" in out
+
+
+def test_inspect_string_no_results(capsys):
+ test_string = "Nothing to see here"
+
+ inspect_content.inspect_string(
+ GCLOUD_PROJECT,
+ test_string,
+ ["FIRST_NAME", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "No
findings" in out + + +def test_inspect_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + + +def test_inspect_file_with_custom_info_types(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out + + +def test_inspect_file_no_results(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "harmless.txt") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "No findings" in out + + +def test_inspect_image_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: PHONE_NUMBER" in out + + +def delete_dlp_job(out): + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): + out = "" + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.txt", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + assert "Job name:" in out + finally: + delete_dlp_job(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file_with_custom_info_types( + bucket, topic_id, subscription_id, capsys +): + out = "" + try: + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] + + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.txt", + topic_id, + subscription_id, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + + assert "Info type: EMAIL_ADDRESS" in out + assert "Job name:" in out + finally: + delete_dlp_job(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file_no_results(bucket, topic_id, subscription_id, capsys): + out = "" + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "harmless.txt", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + + assert "No findings" in out + assert "Job name:" in out + finally: + delete_dlp_job(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_image_file(bucket, topic_id, subscription_id, capsys): + out = "" + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.png", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + assert "Job name:" in out + 
finally:
+ delete_dlp_job(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys):
+ out = ""
+ try:
+ inspect_content.inspect_gcs_file(
+ GCLOUD_PROJECT,
+ bucket.name,
+ "*",
+ topic_id,
+ subscription_id,
+ ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+
+ assert "Info type: EMAIL_ADDRESS" in out
+ assert "Job name:" in out
+ finally:
+ delete_dlp_job(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_datastore(datastore_project, topic_id, subscription_id, capsys):
+ out = ""
+ try:
+ inspect_content.inspect_datastore(
+ GCLOUD_PROJECT,
+ datastore_project,
+ DATASTORE_KIND,
+ topic_id,
+ subscription_id,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: EMAIL_ADDRESS" in out
+ assert "Job name:" in out
+ finally:
+ delete_dlp_job(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_datastore_no_results(
+ datastore_project, topic_id, subscription_id, capsys
+):
+ out = ""
+ try:
+ inspect_content.inspect_datastore(
+ GCLOUD_PROJECT,
+ datastore_project,
+ DATASTORE_KIND,
+ topic_id,
+ subscription_id,
+ ["PHONE_NUMBER"],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "No findings" in out
+ assert "Job name:" in out
+ finally:
+ delete_dlp_job(out)
+
+
+@pytest.mark.skip(reason="Table not found error. Should be investigated.")
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_bigquery(bigquery_project, topic_id, subscription_id, capsys):
+ out = ""
+ try:
+ inspect_content.inspect_bigquery(
+ GCLOUD_PROJECT,
+ bigquery_project,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_TABLE_ID,
+ topic_id,
+ subscription_id,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=1,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Inspection operation started" in out
+ assert "Job name:" in out
+ finally:
+ delete_dlp_job(out)
diff --git a/dlp/snippets/jobs.py b/dlp/snippets/jobs.py
new file mode 100644
index 000000000000..4fcb2d13b4be
--- /dev/null
+++ b/dlp/snippets/jobs.py
@@ -0,0 +1,160 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app to list and delete DLP jobs using the Data Loss Prevention API."""
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_list_jobs]
+def list_dlp_jobs(project, filter_string=None, job_type=None):
+ """Uses the Data Loss Prevention API to list DLP jobs that match the
+ specified filter in the request.
+ Args:
+ project: The project id to use as a parent resource.
+ filter: (Optional) Allows filtering.
+ Supported syntax:
+ * Filter expressions are made up of one or more restrictions.
+ * Restrictions can be combined by 'AND' or 'OR' logical operators.
+ A sequence of restrictions implicitly uses 'AND'.
+ * A restriction has the form of '<field> <operator> <value>'.
+ * Supported fields/values for inspect jobs: + - `state` - PENDING|RUNNING|CANCELED|FINISHED|FAILED + - `inspected_storage` - DATASTORE|CLOUD_STORAGE|BIGQUERY + - `trigger_name` - The resource name of the trigger that + created job. + * Supported fields for risk analysis jobs: + - `state` - RUNNING|CANCELED|FINISHED|FAILED + * The operator must be '=' or '!='. + Examples: + * inspected_storage = cloud_storage AND state = done + * inspected_storage = cloud_storage OR inspected_storage = bigquery + * inspected_storage = cloud_storage AND + (state = done OR state = canceled) + type: (Optional) The type of job. Defaults to 'INSPECT'. + Choices: + DLP_JOB_TYPE_UNSPECIFIED + INSPECT_JOB: The job inspected content for sensitive data. + RISK_ANALYSIS_JOB: The job executed a Risk Analysis computation. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Job type dictionary + job_type_to_int = { + "DLP_JOB_TYPE_UNSPECIFIED": google.cloud.dlp.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED, + "INSPECT_JOB": google.cloud.dlp.DlpJobType.INSPECT_JOB, + "RISK_ANALYSIS_JOB": google.cloud.dlp.DlpJobType.RISK_ANALYSIS_JOB, + } + # If job type is specified, convert job type to number through enums. + if job_type: + job_type = job_type_to_int[job_type] + + # Call the API to get a list of jobs. + response = dlp.list_dlp_jobs( + request={"parent": parent, "filter": filter_string, "type_": job_type} + ) + + # Iterate over results. + for job in response: + print("Job: %s; status: %s" % (job.name, job.state.name)) + + +# [END dlp_list_jobs] + + +# [START dlp_delete_job] +def delete_dlp_job(project, job_name): + """Uses the Data Loss Prevention API to delete a long-running DLP job. + Args: + project: The project id to use as a parent resource. + job_name: The name of the DlpJob resource to be deleted. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id and job name into a full resource id. + name = f"projects/{project}/dlpJobs/{job_name}" + + # Call the API to delete job. + dlp.delete_dlp_job(request={"name": name}) + + print("Successfully deleted %s" % job_name) + + +# [END dlp_delete_job] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + list_parser = subparsers.add_parser( + "list", + help="List Data Loss Prevention API jobs corresponding to a given " "filter.", + ) + list_parser.add_argument( + "project", help="The project id to use as a parent resource." + ) + list_parser.add_argument( + "-f", + "--filter", + help="Filter expressions are made up of one or more restrictions.", + ) + list_parser.add_argument( + "-t", + "--type", + choices=["DLP_JOB_TYPE_UNSPECIFIED", "INSPECT_JOB", "RISK_ANALYSIS_JOB"], + help='The type of job. API defaults to "INSPECT"', + ) + + delete_parser = subparsers.add_parser( + "delete", help="Delete results of a Data Loss Prevention API job." + ) + delete_parser.add_argument( + "project", help="The project id to use as a parent resource." 
+ ) + delete_parser.add_argument( + "job_name", + help="The name of the DlpJob resource to be deleted. " "Example: X-#####", + ) + + args = parser.parse_args() + + if args.content == "list": + list_dlp_jobs(args.project, filter_string=args.filter, job_type=args.type) + elif args.content == "delete": + delete_dlp_job(args.project, args.job_name) diff --git a/dlp/snippets/jobs_test.py b/dlp/snippets/jobs_test.py new file mode 100644 index 000000000000..22ec36460fce --- /dev/null +++ b/dlp/snippets/jobs_test.py @@ -0,0 +1,91 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +import pytest + +import jobs + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +TEST_COLUMN_NAME = "zip_code" +TEST_TABLE_PROJECT_ID = "bigquery-public-data" +TEST_DATASET_ID = "san_francisco" +TEST_TABLE_ID = "bikeshare_trips" +test_job_id = "test-job-{}".format(uuid.uuid4()) + + +@pytest.fixture(scope="module") +def test_job_name(): + import google.cloud.dlp + + dlp = google.cloud.dlp_v2.DlpServiceClient() + + parent = f"projects/{GCLOUD_PROJECT}" + + # Construct job request + risk_job = { + "privacy_metric": { + "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}} + }, + "source_table": { + "project_id": TEST_TABLE_PROJECT_ID, + "dataset_id": TEST_DATASET_ID, + "table_id": TEST_TABLE_ID, + }, + } + + response = dlp.create_dlp_job( + request={"parent": parent, "risk_job": risk_job, "job_id": test_job_id} + ) + full_path = response.name + # API expects only job name, not full project path + job_name = full_path[full_path.rfind("/") + 1 :] + yield job_name + + # clean up job if not deleted + try: + dlp.delete_dlp_job(request={"name": full_path}) + except google.api_core.exceptions.NotFound: + print("Issue during teardown, missing job") + + +def test_list_dlp_jobs(test_job_name, capsys): + jobs.list_dlp_jobs(GCLOUD_PROJECT) + + out, _ = capsys.readouterr() + assert test_job_name not in out + + +def test_list_dlp_jobs_with_filter(test_job_name, capsys): + jobs.list_dlp_jobs( + GCLOUD_PROJECT, + filter_string="state=RUNNING OR state=DONE", + job_type="RISK_ANALYSIS_JOB", + ) + + out, _ = capsys.readouterr() + assert test_job_name in out + + +def test_list_dlp_jobs_with_job_type(test_job_name, capsys): + jobs.list_dlp_jobs(GCLOUD_PROJECT, job_type="INSPECT_JOB") + + out, _ = capsys.readouterr() + assert test_job_name not in out # job created is a risk analysis job + + +def test_delete_dlp_job(test_job_name, capsys): + jobs.delete_dlp_job(GCLOUD_PROJECT, test_job_name) diff --git a/dlp/snippets/metadata.py b/dlp/snippets/metadata.py new file mode 100644 index 000000000000..d5709eeb8156 --- /dev/null +++ b/dlp/snippets/metadata.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that queries the Data Loss Prevention API for supported +categories and info types.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_list_info_types] +def list_info_types(language_code=None, result_filter=None): + """List types of sensitive information within a category. + Args: + language_code: The BCP-47 language code to use, e.g. 'en-US'. + filter: An optional filter to only return info types supported by + certain parts of the API. Defaults to "supported_by=INSPECT". + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Make the API call. + response = dlp.list_info_types( + request={"parent": language_code, "filter": result_filter} + ) + + # Print the results to the console. + print("Info types:") + for info_type in response.info_types: + print( + "{name}: {display_name}".format( + name=info_type.name, display_name=info_type.display_name + ) + ) + + +# [END dlp_list_info_types] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--language_code", + help="The BCP-47 language code to use, e.g. 'en-US'.", + ) + parser.add_argument( + "--filter", + help="An optional filter to only return info types supported by " + 'certain parts of the API. Defaults to "supported_by=INSPECT".', + ) + + args = parser.parse_args() + + list_info_types(language_code=args.language_code, result_filter=args.filter) diff --git a/dlp/snippets/metadata_test.py b/dlp/snippets/metadata_test.py new file mode 100644 index 000000000000..c06440cd3cb0 --- /dev/null +++ b/dlp/snippets/metadata_test.py @@ -0,0 +1,22 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import metadata + + +def test_fetch_info_types(capsys): + metadata.list_info_types() + + out, _ = capsys.readouterr() + assert "EMAIL_ADDRESS" in out diff --git a/dlp/snippets/noxfile_config.py b/dlp/snippets/noxfile_config.py new file mode 100644 index 000000000000..1c2d85d16597 --- /dev/null +++ b/dlp/snippets/noxfile_config.py @@ -0,0 +1,42 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default TEST_CONFIG_OVERRIDE for python repos. + +# You can copy this file into your directory, then it will be imported from +# the noxfile.py. + +# The source of truth: +# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/main/noxfile_config.py + +TEST_CONFIG_OVERRIDE = { + # You can opt out from the test for specific Python versions. + "ignored_versions": ["2.7", "3.6"], + # Old samples are opted out of enforcing Python type hints + # All new samples should feature them + "enforce_type_hints": False, + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + "gcloud_project_env": "GOOGLE_CLOUD_PROJECT", + # "gcloud_project_env": "BUILD_SPECIFIC_GCLOUD_PROJECT", + # If you need to use a specific version of pip, + # change pip_version_override to the string representation + # of the version number, for example, "20.2.4" + "pip_version_override": None, + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + "envs": {}, +} diff --git a/dlp/snippets/quickstart.py b/dlp/snippets/quickstart.py new file mode 100644 index 000000000000..090b5bcc6324 --- /dev/null +++ b/dlp/snippets/quickstart.py @@ -0,0 +1,92 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that queries the Data Loss Prevention API for supported +categories and info types.""" + +from __future__ import print_function + +import argparse +import sys + + +def quickstart(project_id): + """Demonstrates use of the Data Loss Prevention API client library.""" + + # [START dlp_quickstart] + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp_client = google.cloud.dlp_v2.DlpServiceClient() + + # The string to inspect + content = "Robert Frost" + + # Construct the item to inspect. + item = {"value": content} + + # The info types to search for in the content. Required. + info_types = [{"name": "FIRST_NAME"}, {"name": "LAST_NAME"}] + + # The minimum likelihood to constitute a match. Optional. + min_likelihood = google.cloud.dlp_v2.Likelihood.LIKELIHOOD_UNSPECIFIED + + # The maximum number of findings to report (0 = server maximum). Optional. + max_findings = 0 + + # Whether to include the matching string in the results. Optional. + include_quote = True + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. 
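+ # include_quote=True asks the service to echo the matched substring back in
+ # each finding; the result-printing loop below relies on finding.quote
+ # being populated.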
+ inspect_config = {
+ "info_types": info_types,
+ "min_likelihood": min_likelihood,
+ "include_quote": include_quote,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Convert the project id into a full resource id.
+ parent = f"projects/{project_id}"
+
+ # Call the API.
+ response = dlp_client.inspect_content(
+ request={"parent": parent, "inspect_config": inspect_config, "item": item}
+ )
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ print("Quote: {}".format(finding.quote))
+ except AttributeError:
+ pass
+ print("Info type: {}".format(finding.info_type.name))
+ # Convert likelihood value to string representation.
+ likelihood = finding.likelihood.name
+ print("Likelihood: {}".format(likelihood))
+ else:
+ print("No findings.")
+ # [END dlp_quickstart]
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("project_id", help="Enter your GCP project id.", type=str)
+ # Check for missing arguments before parse_args(), which would otherwise
+ # exit with its own error first.
+ if len(sys.argv) == 1:
+ parser.print_usage()
+ sys.exit(1)
+ args = parser.parse_args()
+ quickstart(args.project_id)
diff --git a/dlp/snippets/quickstart_test.py b/dlp/snippets/quickstart_test.py
new file mode 100644
index 000000000000..dc9f91a583d4
--- /dev/null
+++ b/dlp/snippets/quickstart_test.py
@@ -0,0 +1,27 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import quickstart
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+
+def test_quickstart(capsys):
+ quickstart.quickstart(GCLOUD_PROJECT)
+
+ out, _ = capsys.readouterr()
+ assert "FIRST_NAME" in out
+ assert "LAST_NAME" in out
diff --git a/dlp/snippets/redact.py b/dlp/snippets/redact.py
new file mode 100644
index 000000000000..09713c7217c6
--- /dev/null
+++ b/dlp/snippets/redact.py
@@ -0,0 +1,259 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that uses the Data Loss Prevention API to redact the contents of
+an image file."""
+
+from __future__ import print_function
+
+import argparse
+
+# [START dlp_redact_image]
+import mimetypes
+
+# [END dlp_redact_image]
+import os
+
+# [START dlp_redact_image]
+
+
+def redact_image(
+ project,
+ filename,
+ output_filename,
+ info_types,
+ min_likelihood=None,
+ mime_type=None,
+):
+ """Uses the Data Loss Prevention API to redact protected data in an image.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ filename: The path to the file to inspect.
+ output_filename: The path to which the redacted image will be written. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + mime_type: The MIME type of the file. If not specified, the type is + inferred via the Python standard library's mimetypes module. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare image_redaction_configs, a list of dictionaries. Each dictionary + # contains an info_type and optionally the color used for the replacement. + # The color is omitted in this sample, so the default (black) will be used. + image_redaction_configs = [] + + if info_types is not None: + for info_type in info_types: + image_redaction_configs.append({"info_type": info_type}) + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "min_likelihood": min_likelihood, + "info_types": info_types, + } + + # If mime_type is not specified, guess it from the filename. + if mime_type is None: + mime_guess = mimetypes.MimeTypes().guess_type(filename) + mime_type = mime_guess[0] or "application/octet-stream" + + # Select the content type index from the list of supported types. + supported_content_types = { + None: 0, # "Unspecified" + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, + } + content_type_index = supported_content_types.get(mime_type, 0) + + # Construct the byte_item, containing the file's byte data. + with open(filename, mode="rb") as f: + byte_item = {"type_": content_type_index, "data": f.read()} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.redact_image( + request={ + "parent": parent, + "inspect_config": inspect_config, + "image_redaction_configs": image_redaction_configs, + "byte_item": byte_item, + } + ) + + # Write out the results. + with open(output_filename, mode="wb") as f: + f.write(response.redacted_image) + print( + "Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename + ) + ) + + +# [END dlp_redact_image] + +# [START dlp_redact_image_all_text] + + +def redact_image_all_text( + project, + filename, + output_filename, +): + """Uses the Data Loss Prevention API to redact all text in an image. + + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + output_filename: The path to which the redacted image will be written. + + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct the image_redaction_configs, indicating to DLP that all text in + # the input image should be redacted. 
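+ # Unlike redact_image above, no inspect_config is needed here: the single
+ # "redact_all_text" redaction config covers every text region the service
+ # detects.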
+ image_redaction_configs = [{"redact_all_text": True}] + + # Construct the byte_item, containing the file's byte data. + with open(filename, mode="rb") as f: + byte_item = {"type_": google.cloud.dlp_v2.FileType.IMAGE, "data": f.read()} + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.redact_image( + request={ + "parent": parent, + "image_redaction_configs": image_redaction_configs, + "byte_item": byte_item, + } + ) + + # Write out the results. + with open(output_filename, mode="wb") as f: + f.write(response.redacted_image) + + print( + "Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename + ) + ) + + +# [END dlp_redact_image_all_text] + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + common_args_parser = argparse.ArgumentParser(add_help=False) + common_args_parser.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + common_args_parser.add_argument("filename", help="The path to the file to inspect.") + common_args_parser.add_argument( + "output_filename", + help="The path to which the redacted image will be written.", + ) + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select which content should be redacted." + ) + subparsers.required = True + + info_types_parser = subparsers.add_parser( + "info_types", + help="Redact specific infoTypes from an image.", + parents=[common_args_parser], + ) + info_types_parser.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + info_types_parser.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + info_types_parser.add_argument( + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) + + all_text_parser = subparsers.add_parser( + "all_text", + help="Redact all text from an image. The MIME type of the file is " + "inferred via the Python standard library's mimetypes module.", + parents=[common_args_parser], + ) + + args = parser.parse_args() + + if args.content == "info_types": + redact_image( + args.project, + args.filename, + args.output_filename, + args.info_types, + min_likelihood=args.min_likelihood, + mime_type=args.mime_type, + ) + elif args.content == "all_text": + redact_image_all_text( + args.project, + args.filename, + args.output_filename, + ) diff --git a/dlp/snippets/redact_test.py b/dlp/snippets/redact_test.py new file mode 100644 index 000000000000..24ade2125456 --- /dev/null +++ b/dlp/snippets/redact_test.py @@ -0,0 +1,60 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile + +import pytest + +import redact + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") + + +@pytest.fixture(scope="module") +def tempdir(): + tempdir = tempfile.mkdtemp() + yield tempdir + shutil.rmtree(tempdir) + + +def test_redact_image_file(tempdir, capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") + + redact.redact_image( + GCLOUD_PROJECT, + test_filepath, + output_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + ) + + out, _ = capsys.readouterr() + assert output_filepath in out + + +def test_redact_image_all_text(tempdir, capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") + + redact.redact_image_all_text( + GCLOUD_PROJECT, + test_filepath, + output_filepath, + ) + + out, _ = capsys.readouterr() + assert output_filepath in out diff --git a/dlp/snippets/requirements-test.txt b/dlp/snippets/requirements-test.txt new file mode 100644 index 000000000000..3275b420e033 --- /dev/null +++ b/dlp/snippets/requirements-test.txt @@ -0,0 +1,4 @@ +backoff==2.2.1 +pytest==7.2.1 +flaky==3.7.0 +mock==5.0.1 diff --git a/dlp/snippets/requirements.txt b/dlp/snippets/requirements.txt new file mode 100644 index 000000000000..a8368832f620 --- /dev/null +++ b/dlp/snippets/requirements.txt @@ -0,0 +1,5 @@ +google-cloud-dlp==3.11.1 +google-cloud-storage==2.7.0 +google-cloud-pubsub==2.14.1 +google-cloud-datastore==2.13.2 +google-cloud-bigquery==3.6.0 diff --git a/dlp/snippets/resources/accounts.txt b/dlp/snippets/resources/accounts.txt new file mode 100644 index 000000000000..2763cd0ab820 --- /dev/null +++ b/dlp/snippets/resources/accounts.txt @@ -0,0 +1 @@ +My credit card number is 1234 5678 9012 3456, and my CVV is 789. \ No newline at end of file diff --git a/dlp/snippets/resources/dates.csv b/dlp/snippets/resources/dates.csv new file mode 100644 index 000000000000..056fccb328ea --- /dev/null +++ b/dlp/snippets/resources/dates.csv @@ -0,0 +1,5 @@ +name,birth_date,register_date,credit_card +Ann,01/01/1970,07/21/1996,4532908762519852 +James,03/06/1988,04/09/2001,4301261899725540 +Dan,08/14/1945,11/15/2011,4620761856015295 +Laura,11/03/1992,01/04/2017,4564981067258901 \ No newline at end of file diff --git a/dlp/snippets/resources/harmless.txt b/dlp/snippets/resources/harmless.txt new file mode 100644 index 000000000000..5666de37ab23 --- /dev/null +++ b/dlp/snippets/resources/harmless.txt @@ -0,0 +1 @@ +This file is mostly harmless. diff --git a/dlp/snippets/resources/test.png b/dlp/snippets/resources/test.png new file mode 100644 index 000000000000..8f32c8258842 Binary files /dev/null and b/dlp/snippets/resources/test.png differ diff --git a/dlp/snippets/resources/test.txt b/dlp/snippets/resources/test.txt new file mode 100644 index 000000000000..c2ee3815bc9b --- /dev/null +++ b/dlp/snippets/resources/test.txt @@ -0,0 +1 @@ +My phone number is (223) 456-7890 and my email address is gary@somedomain.com. 
\ No newline at end of file
diff --git a/dlp/snippets/risk.py b/dlp/snippets/risk.py
new file mode 100644
index 000000000000..065cdc6bf236
--- /dev/null
+++ b/dlp/snippets/risk.py
@@ -0,0 +1,939 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that uses the Data Loss Prevention API to perform risk analysis."""
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_numerical_stats]
+def numerical_risk_analysis(
+ project,
+ table_project_id,
+ dataset_id,
+ table_id,
+ column_name,
+ topic_id,
+ subscription_id,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to compute risk metrics of a column
+ of numerical data in a Google BigQuery table.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ table_project_id: The Google Cloud project id where the BigQuery table
+ is stored.
+ dataset_id: The id of the dataset to inspect.
+ table_id: The id of the table to inspect.
+ column_name: The name of the column to compute risk metrics for.
+ topic_id: The name of the Pub/Sub topic to notify once the job
+ completes.
+ subscription_id: The name of the Pub/Sub subscription to use when
+ listening for job completion notifications.
+ timeout: The number of seconds to wait for a response from the API.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ import concurrent.futures
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = f"projects/{project}/locations/global"
+
+ # Location info of the BigQuery table.
+ source_table = {
+ "project_id": table_project_id,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Configure the risk analysis job, naming the numeric column to compute
+ # risk metrics for.
+ risk_job = {
+ "privacy_metric": {"numerical_stats_config": {"field": {"name": column_name}}},
+ "source_table": source_table,
+ "actions": actions,
+ }
+
+ # Call the API to start the risk analysis job.
+ operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job})
+
+ def callback(message):
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
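+ # risk_details.numerical_stats_result carries the column's min, max,
+ # and quantile cut points; the prev_value check below collapses
+ # consecutive duplicate cut points so each distinct value prints once.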
+ job = dlp.get_dlp_job(request={"name": operation.name}) + print(f"Job name: {job.name}") + results = job.risk_details.numerical_stats_result + print( + "Value Range: [{}, {}]".format( + results.min_value.integer_value, + results.max_value.integer_value, + ) + ) + prev_value = None + for percent, result in enumerate(results.quantile_values): + value = result.integer_value + if prev_value != value: + print("Value at {}% quantile: {}".format(percent, value)) + prev_value = value + subscription.set_result(None) + else: + # This is not the message we're looking for. + message.drop() + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + subscription = subscriber.subscribe(subscription_path, callback) + + try: + subscription.result(timeout=timeout) + except concurrent.futures.TimeoutError: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + subscription.close() + + +# [END dlp_numerical_stats] + + +# [START dlp_categorical_stats] +def categorical_risk_analysis( + project, + table_project_id, + dataset_id, + table_id, + column_name, + topic_id, + subscription_id, + timeout=300, +): + """Uses the Data Loss Prevention API to compute risk metrics of a column + of categorical data in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + column_name: The name of the column to compute risk metrics for. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + import concurrent.futures + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = f"projects/{project}/locations/global" + + # Location info of the BigQuery table. + source_table = { + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, + } + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Configure risk analysis job + # Give the name of the numeric column to compute risk metrics for + risk_job = { + "privacy_metric": { + "categorical_stats_config": {"field": {"name": column_name}} + }, + "source_table": source_table, + "actions": actions, + } + + # Call API to start risk analysis job + operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job}) + + def callback(message): + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. 
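+ # categorical_stats_result groups the column's distinct values into
+ # histogram buckets by frequency; each bucket below reports its
+ # frequency bounds, its number of distinct values, and a sample of
+ # values with their counts.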
+ job = dlp.get_dlp_job(request={"name": operation.name}) + print(f"Job name: {job.name}") + histogram_buckets = ( + job.risk_details.categorical_stats_result.value_frequency_histogram_buckets # noqa: E501 + ) + # Print bucket stats + for i, bucket in enumerate(histogram_buckets): + print("Bucket {}:".format(i)) + print( + " Most common value occurs {} time(s)".format( + bucket.value_frequency_upper_bound + ) + ) + print( + " Least common value occurs {} time(s)".format( + bucket.value_frequency_lower_bound + ) + ) + print(" {} unique values total.".format(bucket.bucket_size)) + for value in bucket.bucket_values: + print( + " Value {} occurs {} time(s)".format( + value.value.integer_value, value.count + ) + ) + subscription.set_result(None) + else: + # This is not the message we're looking for. + message.drop() + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + subscription = subscriber.subscribe(subscription_path, callback) + + try: + subscription.result(timeout=timeout) + except concurrent.futures.TimeoutError: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + subscription.close() + + +# [END dlp_categorical_stats] + + +# [START dlp_k_anonymity] +def k_anonymity_analysis( + project, + table_project_id, + dataset_id, + table_id, + topic_id, + subscription_id, + quasi_ids, + timeout=300, +): + """Uses the Data Loss Prevention API to compute the k-anonymity of a + column set in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + quasi_ids: A set of columns that form a composite key. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + import concurrent.futures + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Create helper function for unpacking values + def get_values(obj): + return int(obj.integer_value) + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = f"projects/{project}/locations/global" + + # Location info of the BigQuery table. + source_table = { + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, + } + + # Convert quasi id list to Protobuf type + def map_fields(field): + return {"name": field} + + quasi_ids = map(map_fields, quasi_ids) + + # Tell the API where to send a notification when the job is complete. 
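+ # The completion message published to this topic carries a DlpJobName
+ # attribute, which the callback below matches against operation.name to
+ # pick out this job's event.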
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the names of the quasi-identifier columns that form the composite key
+    risk_job = {
+        "privacy_metric": {"k_anonymity_config": {"quasi_ids": quasi_ids}},
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job})
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(request={"name": operation.name})
+            print(f"Job name: {job.name}")
+            histogram_buckets = (
+                job.risk_details.k_anonymity_result.equivalence_class_histogram_buckets
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                if bucket.equivalence_class_size_lower_bound:
+                    print(
+                        "   Bucket size range: [{}, {}]".format(
+                            bucket.equivalence_class_size_lower_bound,
+                            bucket.equivalence_class_size_upper_bound,
+                        )
+                    )
+                    for value_bucket in bucket.bucket_values:
+                        print(
+                            "   Quasi-ID values: {}".format(
+                                list(map(get_values, value_bucket.quasi_ids_values))
+                            )
+                        )
+                        print(
+                            "   Class size: {}".format(
+                                value_bucket.equivalence_class_size
+                            )
+                        )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except concurrent.futures.TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_k_anonymity]
+
+
+# [START dlp_l_diversity]
+def l_diversity_analysis(
+    project,
+    table_project_id,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    sensitive_attribute,
+    quasi_ids,
+    timeout=300,
+):
+    """Uses the Data Loss Prevention API to compute the l-diversity of a
+    column set in a Google BigQuery table.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        table_project_id: The Google Cloud project id where the BigQuery table
+            is stored.
+        dataset_id: The id of the dataset to inspect.
+        table_id: The id of the table to inspect.
+        topic_id: The name of the Pub/Sub topic to notify once the job
+            completes.
+        subscription_id: The name of the Pub/Sub subscription to use when
+            listening for job completion notifications.
+        sensitive_attribute: The column to measure l-diversity relative to.
+        quasi_ids: A set of columns that form a composite key.
+        timeout: The number of seconds to wait for a response from the API.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+    import concurrent.futures
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # Create helper function for unpacking values
+    def get_values(obj):
+        return int(obj.integer_value)
+
+    # Instantiate a client.
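+    # (A single DlpServiceClient can be reused across calls; each sample
+    # creates its own only to stay self-contained.)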
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = f"projects/{project}/locations/global"
+
+    # Location info of the BigQuery table.
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Convert quasi id list to Protobuf type
+    def map_fields(field):
+        return {"name": field}
+
+    quasi_ids = map(map_fields, quasi_ids)
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the names of the quasi-identifier columns and the sensitive column
+    risk_job = {
+        "privacy_metric": {
+            "l_diversity_config": {
+                "quasi_ids": quasi_ids,
+                "sensitive_attribute": {"name": sensitive_attribute},
+            }
+        },
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job})
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(request={"name": operation.name})
+            print(f"Job name: {job.name}")
+            histogram_buckets = (
+                job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets  # noqa: E501
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                print(
+                    "   Bucket size range: [{}, {}]".format(
+                        bucket.sensitive_value_frequency_lower_bound,
+                        bucket.sensitive_value_frequency_upper_bound,
+                    )
+                )
+                for value_bucket in bucket.bucket_values:
+                    print(
+                        "   Quasi-ID values: {}".format(
+                            list(map(get_values, value_bucket.quasi_ids_values))
+                        )
+                    )
+                    print(
+                        "   Class size: {}".format(value_bucket.equivalence_class_size)
+                    )
+                    for value in value_bucket.top_sensitive_values:
+                        print(
+                            "   Sensitive value {} occurs {} time(s)".format(
+                                value.value, value.count
+                            )
+                        )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except concurrent.futures.TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_l_diversity]
+
+
+# [START dlp_k_map]
+def k_map_estimate_analysis(
+    project,
+    table_project_id,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    quasi_ids,
+    info_types,
+    region_code="US",
+    timeout=300,
+):
+    """Uses the Data Loss Prevention API to compute the k-map risk estimation
+    of a column set in a Google BigQuery table.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        table_project_id: The Google Cloud project id where the BigQuery table
+            is stored.
+        dataset_id: The id of the dataset to inspect.
+        table_id: The id of the table to inspect.
+        topic_id: The name of the Pub/Sub topic to notify once the job
+            completes.
+        subscription_id: The name of the Pub/Sub subscription to use when
+            listening for job completion notifications.
+        quasi_ids: A set of columns that form a composite key and optionally
+            their reidentification distributions.
+        info_types: Type of information of the quasi_id in order to provide a
+            statistical model of population.
+        region_code: The ISO 3166-1 region code that the data is representative
+            of. Can be omitted if using a region-specific infoType (such as
+            US_ZIP_5).
+        timeout: The number of seconds to wait for a response from the API.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+    import concurrent.futures
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # Create helper function for unpacking values
+    def get_values(obj):
+        return int(obj.integer_value)
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into full resource ids.
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = f"projects/{project}/locations/global"
+
+    # Location info of the BigQuery table.
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Check that numbers of quasi-ids and info types are equal
+    if len(quasi_ids) != len(info_types):
+        raise ValueError(
+            "Number of infoTypes and number of quasi-identifiers must be equal!"
+        )
+
+    # Convert quasi id list to Protobuf type
+    def map_fields(quasi_id, info_type):
+        return {"field": {"name": quasi_id}, "info_type": {"name": info_type}}
+
+    quasi_ids = map(map_fields, quasi_ids, info_types)
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the names of the quasi-identifier columns and their infoTypes
+    risk_job = {
+        "privacy_metric": {
+            "k_map_estimation_config": {
+                "quasi_ids": quasi_ids,
+                "region_code": region_code,
+            }
+        },
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(request={"parent": parent, "risk_job": risk_job})
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(request={"name": operation.name})
+            print(f"Job name: {job.name}")
+            histogram_buckets = (
+                job.risk_details.k_map_estimation_result.k_map_estimation_histogram
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                print(
+                    "   Anonymity range: [{}, {}]".format(
+                        bucket.min_anonymity, bucket.max_anonymity
+                    )
+                )
+                print("   Size: {}".format(bucket.bucket_size))
+                for value_bucket in bucket.bucket_values:
+                    print(
+                        "   Values: {}".format(
+                            list(map(get_values, value_bucket.quasi_ids_values))
+                        )
+                    )
+                    print(
+                        "   Estimated k-map anonymity: {}".format(
+                            value_bucket.estimated_anonymity
+                        )
+                    )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
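+    # subscribe() returns a streaming-pull future: result() below blocks until
+    # the callback calls set_result() or the timeout elapses.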
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except concurrent.futures.TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_k_map]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        dest="content", help="Select how to submit content to the API."
+    )
+    subparsers.required = True
+
+    numerical_parser = subparsers.add_parser("numerical", help="")
+    numerical_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    numerical_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    numerical_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    numerical_parser.add_argument("table_id", help="The id of the table to inspect.")
+    numerical_parser.add_argument(
+        "column_name",
+        help="The name of the column to compute risk metrics for.",
+    )
+    numerical_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    numerical_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    numerical_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    categorical_parser = subparsers.add_parser("categorical", help="")
+    categorical_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    categorical_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    categorical_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    categorical_parser.add_argument("table_id", help="The id of the table to inspect.")
+    categorical_parser.add_argument(
+        "column_name",
+        help="The name of the column to compute risk metrics for.",
+    )
+    categorical_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    categorical_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    categorical_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    k_anonymity_parser = subparsers.add_parser(
+        "k_anonymity",
+        help="Computes the k-anonymity of a column set in a Google BigQuery table.",
+    )
+    k_anonymity_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    k_anonymity_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    k_anonymity_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    k_anonymity_parser.add_argument("table_id", help="The id of the table to inspect.")
+    k_anonymity_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    k_anonymity_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    k_anonymity_parser.add_argument(
+        "quasi_ids",
+        nargs="+",
+        help="A set of columns that form a composite key.",
+    )
+    k_anonymity_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    l_diversity_parser = subparsers.add_parser(
+        "l_diversity",
+        help="Computes the l-diversity of a column set in a Google BigQuery table.",
+    )
+    l_diversity_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    l_diversity_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    l_diversity_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    l_diversity_parser.add_argument("table_id", help="The id of the table to inspect.")
+    l_diversity_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    l_diversity_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    l_diversity_parser.add_argument(
+        "sensitive_attribute",
+        help="The column to measure l-diversity relative to.",
+    )
+    l_diversity_parser.add_argument(
+        "quasi_ids",
+        nargs="+",
+        help="A set of columns that form a composite key.",
+    )
+    l_diversity_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    k_map_parser = subparsers.add_parser(
+        "k_map",
+        help="Computes the k-map risk estimation of a column set in a Google "
+        "BigQuery table.",
+    )
+    k_map_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    k_map_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    k_map_parser.add_argument("dataset_id", help="The id of the dataset to inspect.")
+    k_map_parser.add_argument("table_id", help="The id of the table to inspect.")
+    k_map_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    k_map_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    k_map_parser.add_argument(
+        "quasi_ids",
+        nargs="+",
+        help="A set of columns that form a composite key.",
+    )
+    k_map_parser.add_argument(
+        "-t",
+        "--info-types",
+        nargs="+",
+        help="Type of information of the quasi_id in order to provide a "
+        "statistical model of population.",
+        required=True,
+    )
+    k_map_parser.add_argument(
+        "-r",
+        "--region-code",
+        default="US",
+        help="The ISO 3166-1 region code that the data is representative of.",
+    )
+    k_map_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    args = parser.parse_args()
+
+    if args.content == "numerical":
+        numerical_risk_analysis(
+            args.project,
+            args.table_project_id,
+            args.dataset_id,
+            args.table_id,
+            args.column_name,
+            args.topic_id,
+            args.subscription_id,
+ timeout=args.timeout, + ) + elif args.content == "categorical": + categorical_risk_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.column_name, + args.topic_id, + args.subscription_id, + timeout=args.timeout, + ) + elif args.content == "k_anonymity": + k_anonymity_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, + args.quasi_ids, + timeout=args.timeout, + ) + elif args.content == "l_diversity": + l_diversity_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, + args.sensitive_attribute, + args.quasi_ids, + timeout=args.timeout, + ) + elif args.content == "k_map": + k_map_estimate_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, + args.quasi_ids, + args.info_types, + region_code=args.region_code, + timeout=args.timeout, + ) diff --git a/dlp/snippets/risk_test.py b/dlp/snippets/risk_test.py new file mode 100644 index 000000000000..cbc596122743 --- /dev/null +++ b/dlp/snippets/risk_test.py @@ -0,0 +1,398 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +import google.cloud.bigquery +import google.cloud.dlp_v2 +import google.cloud.pubsub +import pytest + +import risk + +UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] +GCLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") +TABLE_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT") +TOPIC_ID = "dlp-test" + UNIQUE_STRING +SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING +UNIQUE_FIELD = "Name" +REPEATED_FIELD = "Mystery" +NUMERIC_FIELD = "Age" +STRING_BOOLEAN_FIELD = "Gender" + +BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING +BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING +BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING +DLP_CLIENT = google.cloud.dlp_v2.DlpServiceClient() + + +# Create new custom topic/subscription +# We observe sometimes all the tests in this file fail. In a +# hypothesis where DLP service somehow loses the connection to the +# topic, now we use function scope for Pub/Sub fixtures. +@pytest.fixture(scope="module") +def topic_id(): + # Creates a pubsub topic, and tears it down. + publisher = google.cloud.pubsub.PublisherClient() + topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID) + try: + publisher.create_topic(request={"name": topic_path}) + except google.api_core.exceptions.AlreadyExists: + pass + + yield TOPIC_ID + + publisher.delete_topic(request={"topic": topic_path}) + + +@pytest.fixture(scope="module") +def subscription_id(topic_id): + # Subscribes to a topic. 
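+    # Create the subscription before any DLP job runs: messages published
+    # while no subscription exists are not delivered later.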
+ subscriber = google.cloud.pubsub.SubscriberClient() + topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id) + subscription_path = subscriber.subscription_path(GCLOUD_PROJECT, SUBSCRIPTION_ID) + try: + subscriber.create_subscription( + request={"name": subscription_path, "topic": topic_path} + ) + except google.api_core.exceptions.AlreadyExists: + pass + + yield SUBSCRIPTION_ID + + subscriber.delete_subscription(request={"subscription": subscription_path}) + + +@pytest.fixture(scope="module") +def bigquery_project(): + # Adds test Bigquery data, yields the project ID and then tears down. + + bigquery_client = google.cloud.bigquery.Client() + + dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID) + dataset = google.cloud.bigquery.Dataset(dataset_ref) + try: + dataset = bigquery_client.create_dataset(dataset) + except google.api_core.exceptions.Conflict: + dataset = bigquery_client.get_dataset(dataset) + table_ref = dataset_ref.table(BIGQUERY_TABLE_ID) + table = google.cloud.bigquery.Table(table_ref) + + harmful_table_ref = dataset_ref.table(BIGQUERY_HARMFUL_TABLE_ID) + harmful_table = google.cloud.bigquery.Table(harmful_table_ref) + + table.schema = ( + google.cloud.bigquery.SchemaField("Name", "STRING"), + google.cloud.bigquery.SchemaField("Comment", "STRING"), + ) + + harmful_table.schema = ( + google.cloud.bigquery.SchemaField("Name", "STRING", "REQUIRED"), + google.cloud.bigquery.SchemaField("TelephoneNumber", "STRING", "REQUIRED"), + google.cloud.bigquery.SchemaField("Mystery", "STRING", "REQUIRED"), + google.cloud.bigquery.SchemaField("Age", "INTEGER", "REQUIRED"), + google.cloud.bigquery.SchemaField("Gender", "STRING"), + google.cloud.bigquery.SchemaField("RegionCode", "STRING"), + ) + + try: + table = bigquery_client.create_table(table) + except google.api_core.exceptions.Conflict: + table = bigquery_client.get_table(table) + + try: + harmful_table = bigquery_client.create_table(harmful_table) + except google.api_core.exceptions.Conflict: + harmful_table = bigquery_client.get_table(harmful_table) + + rows_to_insert = [("Gary Smith", "My email is gary@example.com")] + harmful_rows_to_insert = [ + ( + "Gandalf", + "(123) 456-7890", + "4231 5555 6781 9876", + 27, + "Male", + "US", + ), + ( + "Dumbledore", + "(313) 337-1337", + "6291 8765 1095 7629", + 27, + "Male", + "US", + ), + ("Joe", "(452) 123-1234", "3782 2288 1166 3030", 35, "Male", "US"), + ("James", "(567) 890-1234", "8291 3627 8250 1234", 19, "Male", "US"), + ( + "Marie", + "(452) 123-1234", + "8291 3627 8250 1234", + 35, + "Female", + "US", + ), + ( + "Carrie", + "(567) 890-1234", + "2253 5218 4251 4526", + 35, + "Female", + "US", + ), + ] + + bigquery_client.insert_rows(table, rows_to_insert) + bigquery_client.insert_rows(harmful_table, harmful_rows_to_insert) + yield GCLOUD_PROJECT + + bigquery_client.delete_dataset(dataset_ref, delete_contents=True) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_numerical_risk_analysis(topic_id, subscription_id, bigquery_project, capsys): + risk.numerical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + NUMERIC_FIELD, + topic_id, + subscription_id, + ) + + out, _ = capsys.readouterr() + assert "Value Range:" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_categorical_risk_analysis_on_string_field( + topic_id, 
subscription_id, bigquery_project, capsys +): + risk.categorical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + UNIQUE_FIELD, + topic_id, + subscription_id, + ) + + out, _ = capsys.readouterr() + assert "Most common value occurs" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_categorical_risk_analysis_on_number_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.categorical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + NUMERIC_FIELD, + topic_id, + subscription_id, + ) + + out, _ = capsys.readouterr() + assert "Most common value occurs" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_anonymity_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_anonymity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD], + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_anonymity_analysis_multiple_fields( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_anonymity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, REPEATED_FIELD], + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_l_diversity_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.l_diversity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + UNIQUE_FIELD, + [NUMERIC_FIELD], + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_l_diversity_analysis_multiple_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.l_diversity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + UNIQUE_FIELD, + [NUMERIC_FIELD, REPEATED_FIELD], + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + 
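+                # Delete the finished job so repeated test runs do not
+                # accumulate DLP jobs in the project.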
DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD], + ["AGE"], + ) + + out, _ = capsys.readouterr() + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=5, min_passes=1) +def test_k_map_estimate_analysis_multiple_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], + ["AGE", "GENDER"], + ) + + out, _ = capsys.readouterr() + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out + assert "Job name:" in out + for line in str(out).split("\n"): + if "Job name" in line: + job_name = line.split(":")[1].strip() + DLP_CLIENT.delete_dlp_job(name=job_name) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_quasi_ids_info_types_equal( + topic_id, subscription_id, bigquery_project +): + with pytest.raises(ValueError): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], + ["AGE"], + ) diff --git a/dlp/snippets/templates.py b/dlp/snippets/templates.py new file mode 100644 index 000000000000..6c618a0a7493 --- /dev/null +++ b/dlp/snippets/templates.py @@ -0,0 +1,255 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that sets up Data Loss Prevention API inspect templates.""" + +from __future__ import print_function + +import argparse +import os + + +# [START dlp_create_inspect_template] +def create_inspect_template( + project, + info_types, + template_id=None, + display_name=None, + min_likelihood=None, + max_findings=None, + include_quote=None, +): + """Creates a Data Loss Prevention API inspect template. + Args: + project: The Google Cloud project id to use as a parent resource. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + template_id: The id of the template. If omitted, an id will be randomly + generated. + display_name: The optional display name of the template. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. 
+ include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + inspect_template = { + "inspect_config": inspect_config, + "display_name": display_name, + } + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.create_inspect_template( + request={ + "parent": parent, + "inspect_template": inspect_template, + "template_id": template_id, + } + ) + + print("Successfully created template {}".format(response.name)) + + +# [END dlp_create_inspect_template] + + +# [START dlp_list_templates] +def list_inspect_templates(project): + """Lists all Data Loss Prevention API inspect templates. + Args: + project: The Google Cloud project id to use as a parent resource. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Call the API. + response = dlp.list_inspect_templates(request={"parent": parent}) + + for template in response: + print("Template {}:".format(template.name)) + if template.display_name: + print(" Display Name: {}".format(template.display_name)) + print(" Created: {}".format(template.create_time)) + print(" Updated: {}".format(template.update_time)) + + config = template.inspect_config + print( + " InfoTypes: {}".format(", ".join([it.name for it in config.info_types])) + ) + print(" Minimum likelihood: {}".format(config.min_likelihood)) + print(" Include quotes: {}".format(config.include_quote)) + print( + " Max findings per request: {}".format( + config.limits.max_findings_per_request + ) + ) + + +# [END dlp_list_templates] + + +# [START dlp_delete_inspect_template] +def delete_inspect_template(project, template_id): + """Deletes a Data Loss Prevention API template. + Args: + project: The id of the Google Cloud project which owns the template. + template_id: The id of the template to delete. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Combine the template id with the parent id. + template_resource = "{}/inspectTemplates/{}".format(parent, template_id) + + # Call the API. 
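+    # (delete_inspect_template raises google.api_core.exceptions.NotFound if
+    # the template id does not exist.)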
+ dlp.delete_inspect_template(request={"name": template_resource}) + + print("Template {} successfully deleted.".format(template_resource)) + + +# [END dlp_delete_inspect_template] + + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="action", help="Select which action to perform." + ) + subparsers.required = True + + parser_create = subparsers.add_parser("create", help="Create a template.") + parser_create.add_argument( + "--template_id", + help="The id of the template. If omitted, an id will be randomly " "generated", + ) + parser_create.add_argument( + "--display_name", help="The optional display name of the template." + ) + parser_create.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_create.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_create.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_create.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_create.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_list = subparsers.add_parser("list", help="List all templates.") + parser_list.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + parser_delete = subparsers.add_parser("delete", help="Delete a template.") + parser_delete.add_argument("template_id", help="The id of the template to delete.") + parser_delete.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + args = parser.parse_args() + + if args.action == "create": + create_inspect_template( + args.project, + args.info_types, + template_id=args.template_id, + display_name=args.display_name, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.action == "list": + list_inspect_templates(args.project) + elif args.action == "delete": + delete_inspect_template(args.project, args.template_id) diff --git a/dlp/snippets/templates_test.py b/dlp/snippets/templates_test.py new file mode 100644 index 000000000000..4682f47cbd0b --- /dev/null +++ b/dlp/snippets/templates_test.py @@ -0,0 +1,60 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +import google.api_core.exceptions +import google.cloud.storage + +import templates + +UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +TEST_TEMPLATE_ID = "test-template" + UNIQUE_STRING + + +def test_create_list_and_delete_template(capsys): + try: + templates.create_inspect_template( + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + template_id=TEST_TEMPLATE_ID, + ) + except google.api_core.exceptions.InvalidArgument: + # Template already exists, perhaps due to a previous interrupted test. + templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + # Try again and move on. + templates.create_inspect_template( + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + template_id=TEST_TEMPLATE_ID, + ) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + templates.list_inspect_templates(GCLOUD_PROJECT) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out diff --git a/dlp/snippets/triggers.py b/dlp/snippets/triggers.py new file mode 100644 index 000000000000..11acd6546f29 --- /dev/null +++ b/dlp/snippets/triggers.py @@ -0,0 +1,286 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that sets up Data Loss Prevention API automation triggers.""" + +from __future__ import print_function + +import argparse +import os + + +# [START dlp_create_trigger] +def create_trigger( + project, + bucket, + scan_period_days, + info_types, + trigger_id=None, + display_name=None, + description=None, + min_likelihood=None, + max_findings=None, + auto_populate_timespan=False, +): + """Creates a scheduled Data Loss Prevention API inspect_content trigger. + Args: + project: The Google Cloud project id to use as a parent resource. + bucket: The name of the GCS bucket to scan. This sample scans all + files in the bucket using a wildcard. + scan_period_days: How often to repeat the scan, in days. + The minimum is 1 day. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + trigger_id: The id of the trigger. If omitted, an id will be randomly + generated. + display_name: The optional display name of the trigger. + description: The optional description of the trigger. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. 
+        auto_populate_timespan: Automatically populates time span config start
+            and end times in order to scan new content only.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Prepare info_types by converting the list of strings into a list of
+    # dictionaries (protos are also accepted).
+    info_types = [{"name": info_type} for info_type in info_types]
+
+    # Construct the configuration dictionary. Keys which are None may
+    # optionally be omitted entirely.
+    inspect_config = {
+        "info_types": info_types,
+        "min_likelihood": min_likelihood,
+        "limits": {"max_findings_per_request": max_findings},
+    }
+
+    # Construct a cloud_storage_options dictionary with the bucket's URL.
+    url = "gs://{}/*".format(bucket)
+    storage_config = {
+        "cloud_storage_options": {"file_set": {"url": url}},
+        # Time-based configuration for each storage object.
+        "timespan_config": {
+            # Auto-populate start and end times in order to scan new objects
+            # only.
+            "enable_auto_population_of_timespan_config": auto_populate_timespan
+        },
+    }
+
+    # Construct the job definition.
+    job = {"inspect_config": inspect_config, "storage_config": storage_config}
+
+    # Construct the schedule definition:
+    schedule = {
+        "recurrence_period_duration": {"seconds": scan_period_days * 60 * 60 * 24}
+    }
+
+    # Construct the trigger definition.
+    job_trigger = {
+        "inspect_job": job,
+        "display_name": display_name,
+        "description": description,
+        "triggers": [{"schedule": schedule}],
+        "status": google.cloud.dlp_v2.JobTrigger.Status.HEALTHY,
+    }
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.create_job_trigger(
+        request={"parent": parent, "job_trigger": job_trigger, "trigger_id": trigger_id}
+    )
+
+    print("Successfully created trigger {}".format(response.name))
+
+
+# [END dlp_create_trigger]
+
+
+# [START dlp_list_triggers]
+def list_triggers(project):
+    """Lists all Data Loss Prevention API triggers.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = f"projects/{project}"
+
+    # Call the API.
+    response = dlp.list_job_triggers(request={"parent": parent})
+
+    for trigger in response:
+        print("Trigger {}:".format(trigger.name))
+        print("  Created: {}".format(trigger.create_time))
+        print("  Updated: {}".format(trigger.update_time))
+        if trigger.display_name:
+            print("  Display Name: {}".format(trigger.display_name))
+        if trigger.description:
+            print("  Description: {}".format(trigger.description))
+        print("  Status: {}".format(trigger.status))
+        print("  Error count: {}".format(len(trigger.errors)))
+
+
+# [END dlp_list_triggers]
+
+
+# [START dlp_delete_trigger]
+def delete_trigger(project, trigger_id):
+    """Deletes a Data Loss Prevention API trigger.
+    Args:
+        project: The id of the Google Cloud project which owns the trigger.
+        trigger_id: The id of the trigger to delete.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
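+    # (Deleting a trigger does not cancel or delete jobs the trigger has
+    # already started.)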
+ dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = f"projects/{project}" + + # Combine the trigger id with the parent id. + trigger_resource = "{}/jobTriggers/{}".format(parent, trigger_id) + + # Call the API. + dlp.delete_job_trigger(request={"name": trigger_resource}) + + print("Trigger {} successfully deleted.".format(trigger_resource)) + + +# [END dlp_delete_trigger] + + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="action", help="Select which action to perform." + ) + subparsers.required = True + + parser_create = subparsers.add_parser("create", help="Create a trigger.") + parser_create.add_argument( + "bucket", help="The name of the GCS bucket containing the file." + ) + parser_create.add_argument( + "scan_period_days", + type=int, + help="How often to repeat the scan, in days. The minimum is 1 day.", + ) + parser_create.add_argument( + "--trigger_id", + help="The id of the trigger. If omitted, an id will be randomly " "generated", + ) + parser_create.add_argument( + "--display_name", help="The optional display name of the trigger." + ) + parser_create.add_argument( + "--description", help="The optional description of the trigger." + ) + parser_create.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_create.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_create.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_create.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_create.add_argument( + "--auto_populate_timespan", + type=bool, + help="Limit scan to new content only.", + ) + + parser_list = subparsers.add_parser("list", help="List all triggers.") + parser_list.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + parser_delete = subparsers.add_parser("delete", help="Delete a trigger.") + parser_delete.add_argument("trigger_id", help="The id of the trigger to delete.") + parser_delete.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + args = parser.parse_args() + + if args.action == "create": + create_trigger( + args.project, + args.bucket, + args.scan_period_days, + args.info_types, + trigger_id=args.trigger_id, + display_name=args.display_name, + description=args.description, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + auto_populate_timespan=args.auto_populate_timespan, + ) + elif args.action == "list": + list_triggers(args.project) + elif args.action == "delete": + delete_trigger(args.project, args.trigger_id) diff --git a/dlp/snippets/triggers_test.py b/dlp/snippets/triggers_test.py new file mode 100644 index 
000000000000..8bd73db2f959
--- /dev/null
+++ b/dlp/snippets/triggers_test.py
@@ -0,0 +1,102 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.storage
+import pytest
+
+import triggers
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TEST_TRIGGER_ID = "test-trigger" + UNIQUE_STRING
+
+
+@pytest.fixture(scope="module")
+def bucket():
+    # Creates a GCS bucket, uploads files required for the test, and tears down
+    # the entire bucket afterwards.
+
+    client = google.cloud.storage.Client()
+    try:
+        bucket = client.get_bucket(TEST_BUCKET_NAME)
+    except google.cloud.exceptions.NotFound:
+        bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+    # Upload the blobs and keep track of them in a list.
+    blobs = []
+    for name in RESOURCE_FILE_NAMES:
+        path = os.path.join(RESOURCE_DIRECTORY, name)
+        blob = bucket.blob(name)
+        blob.upload_from_filename(path)
+        blobs.append(blob)
+
+    # Yield the object to the test; lines after this execute as a teardown.
+    yield bucket
+
+    # Delete the files.
+    for blob in blobs:
+        try:
+            blob.delete()
+        except google.cloud.exceptions.NotFound:
+            print("Issue during teardown, missing blob")
+
+    # Attempt to delete the bucket; this will only work if it is empty.
+    bucket.delete()
+
+
+def test_create_list_and_delete_trigger(bucket, capsys):
+    try:
+        triggers.create_trigger(
+            GCLOUD_PROJECT,
+            bucket.name,
+            7,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            trigger_id=TEST_TRIGGER_ID,
+        )
+    except google.api_core.exceptions.InvalidArgument:
+        # Trigger already exists, perhaps due to a previous interrupted test.
+        triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+        out, _ = capsys.readouterr()
+        assert TEST_TRIGGER_ID in out
+
+        # Try again and move on.
+        triggers.create_trigger(
+            GCLOUD_PROJECT,
+            bucket.name,
+            7,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            trigger_id=TEST_TRIGGER_ID,
+            auto_populate_timespan=True,
+        )
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
+
+    triggers.list_triggers(GCLOUD_PROJECT)
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
+
+    triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
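+
+
+def test_list_triggers_smoke(capsys):
+    # Illustrative sketch (not part of the original suite): list_triggers
+    # should run cleanly even when the project has no triggers left; any
+    # output it produces must come from its per-trigger printer.
+    triggers.list_triggers(GCLOUD_PROJECT)
+
+    out, _ = capsys.readouterr()
+    assert out == "" or "Trigger" in out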