Skip to content

Commit

Permalink
MINOR - PII Scanner tests and log levels (#17686)
Browse files Browse the repository at this point in the history
* MINOR - PII Scanner tests and log levels

* MINOR - PII Scanner tests and log levels
  • Loading branch information
pmbrull committed Sep 4, 2024
1 parent 847b226 commit 75be6b6
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 89 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def patch( # pylint: disable=too-many-arguments

except Exception as exc:
logger.debug(traceback.format_exc())
logger.error(f"Error trying to PATCH {get_log_name(source)}: {exc}")
logger.warning(f"Error trying to PATCH {get_log_name(source)}: {exc}")

return None

Expand Down
8 changes: 3 additions & 5 deletions ingestion/src/metadata/ingestion/sink/metadata_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,11 +586,9 @@ def write_profiler_response(self, record: ProfilerResponse) -> Either[Table]:
table=record.table, column_tags=record.column_tags
)
if not patched:
self.status.failed(
StackTraceError(
name=table.fullyQualifiedName.root,
error="Error patching tags for table",
)
self.status.warning(
key=table.fullyQualifiedName.root,
reason="Error patching tags for table",
)
else:
logger.debug(
Expand Down
14 changes: 10 additions & 4 deletions ingestion/src/metadata/profiler/source/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
OpenMetadata source for the profiler
"""
import traceback
from typing import Iterable, Optional, cast
from typing import Iterable, List, Optional, cast

from pydantic import BaseModel, ConfigDict

Expand Down Expand Up @@ -265,6 +265,14 @@ def get_database_entities(self):

return databases

def _get_fields(self) -> List[str]:
    """Return the entity fields needed to process the tables.

    When PII processing is enabled we also need the tag information,
    so the tags field is appended to the base table fields.
    """
    if self.source_config.processPiiSensitive:
        # PII detection requires the existing tags on top of the base fields
        return TABLE_FIELDS + TAGS_FIELD
    return TABLE_FIELDS

def get_table_entities(self, database):
"""
List and filter OpenMetadata tables based on the
Expand All @@ -282,9 +290,7 @@ def get_table_entities(self, database):
"""
tables = self.metadata.list_all_entities(
entity=Table,
fields=TABLE_FIELDS
if not self.source_config.processPiiSensitive
else TABLE_FIELDS + TAGS_FIELD,
fields=self._get_fields(),
params={
"service": self.config.source.serviceName,
"database": fqn.build(
Expand Down
36 changes: 1 addition & 35 deletions ingestion/src/metadata/profiler/source/metadata_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from sqlalchemy.inspection import inspect

from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.services.ingestionPipelines.status import (
StackTraceError,
)
Expand Down Expand Up @@ -138,7 +137,7 @@ def _iter(self, *_, **__) -> Iterable[Either[ProfilerSourceAndEntity]]:
service_name=None,
schema_name=schema_name,
table_name=table_name,
fields="tableProfilerConfig",
fields=",".join(self._get_fields()),
)
if not table_entity:
logger.debug(
Expand Down Expand Up @@ -226,36 +225,3 @@ def get_database_names(self) -> Iterable[str]:
except Exception as exc:
logger.debug(f"Failed to fetch database names {exc}")
logger.debug(traceback.format_exc())

def get_table_entities(self, database):
    """
    List and filter OpenMetadata tables based on the
    source configuration.

    The listing will be based on the entities from the
    informed service name in the source configuration.

    Note that users can specify `table_filter_pattern` to
    either be `includes` or `excludes`. This means
    that we will either use what is specified in `includes`
    or we will use everything but the tables excluded.
    Same with `schema_filter_pattern`.
    """
    # List every table of the given database, requesting only the
    # tableProfilerConfig field since that is all this source reads.
    tables = self.metadata.list_all_entities(
        entity=Table,
        fields=[
            "tableProfilerConfig",
        ],
        params={
            "service": self.config.source.serviceName,
            # Scope the listing to this database via its fully qualified name
            "database": fqn.build(
                self.metadata,
                entity_type=Database,
                service_name=self.config.source.serviceName,
                database_name=database.name.root,
            ),
        },  # type: ignore
    )

    # Delegate include/exclude pattern filtering before yielding entities
    yield from self.filter_entities(tables)
4 changes: 2 additions & 2 deletions ingestion/src/metadata/workflow/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def _get_source_class(self):
if self.config.source.serviceName:
return OpenMetadataSource
logger.info(
"Database Service name not provided, we will scan all the tables"
"available within data source and locate table entity in OpenMetadata"
"Database Service name not provided, we will scan all the tables "
"available within data source and locate table entity in OpenMetadata "
"to ingest profiler data."
)
return OpenMetadataSourceExt
Expand Down
100 changes: 58 additions & 42 deletions ingestion/tests/unit/pii/test_ner_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,54 +11,70 @@
"""
Test Column Name Scanner
"""
from unittest import TestCase

from metadata.pii.scanners.ner_scanner import NERScanner
import pytest

from metadata.pii.scanners.ner_scanner import NERScanner, StringAnalysis

class NERScannerTest(TestCase):
"""
Validate various typical column names
"""

ner_scanner = NERScanner()
@pytest.fixture
def scanner() -> NERScanner:
    """Provide a NER scanner instance for the tests below."""
    ner_scanner = NERScanner()
    return ner_scanner

def test_scanner_none(self):
self.assertIsNone(self.ner_scanner.scan(list(range(100))))
self.assertIsNone(
self.ner_scanner.scan(
" ".split(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
)

def test_scanner_none(scanner):
assert scanner.scan(list(range(100))) is None
assert (
scanner.scan(
" ".split(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
)
)
) is None

def test_scanner_sensitive(self):
self.assertEqual(
self.ner_scanner.scan(
[
"[email protected]",
"[email protected]",
"[email protected]",
]
).tag_fqn,
"PII.Sensitive",
)
self.assertEqual(
self.ner_scanner.scan(
["im ok", "[email protected]", "not sensitive"]
).tag_fqn,
"PII.Sensitive",
)

def test_scanner_nonsensitive(self):
self.assertEqual(
self.ner_scanner.scan(
[
"Washington",
"Alaska",
"Netherfield Lea Street",
]
).tag_fqn,
"PII.NonSensitive",
)
def test_scanner_sensitive(scanner):
assert (
scanner.scan(
[
"[email protected]",
"[email protected]",
"[email protected]",
]
).tag_fqn
== "PII.Sensitive"
)
assert (
scanner.scan(["im ok", "[email protected]", "not sensitive"]).tag_fqn
== "PII.Sensitive"
)


def test_scanner_nonsensitive(scanner):
assert (
scanner.scan(
[
"Washington",
"Alaska",
"Netherfield Lea Street",
]
).tag_fqn
== "PII.NonSensitive"
)


def test_get_highest_score_label(scanner):
    """Validate that even with score clashes, we only get one result back"""
    # Clear winner: the higher-scored label is returned with its score
    clear_winner = {
        "PII.Sensitive": StringAnalysis(score=0.9, appearances=1),
        "PII.NonSensitive": StringAnalysis(score=0.8, appearances=1),
    }
    assert scanner.get_highest_score_label(clear_winner) == ("PII.Sensitive", 0.9)

    # Exact tie: still a single deterministic result
    tie = {
        "PII.Sensitive": StringAnalysis(score=1.0, appearances=1),
        "PII.NonSensitive": StringAnalysis(score=1.0, appearances=1),
    }
    label, score = scanner.get_highest_score_label(tie)
    assert (label, score) == ("PII.Sensitive", 1.0)

0 comments on commit 75be6b6

Please sign in to comment.