Skip to content

Commit

Permalink
MINOR - PII Scanner tests and log levels (#17686)
Browse files Browse the repository at this point in the history
* MINOR - PII Scanner tests and log levels

* MINOR - PII Scanner tests and log levels
  • Loading branch information
pmbrull committed Sep 4, 2024
1 parent 847b226 commit 75be6b6
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 89 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def patch( # pylint: disable=too-many-arguments

except Exception as exc:
logger.debug(traceback.format_exc())
logger.error(f"Error trying to PATCH {get_log_name(source)}: {exc}")
logger.warning(f"Error trying to PATCH {get_log_name(source)}: {exc}")

return None

Expand Down
8 changes: 3 additions & 5 deletions ingestion/src/metadata/ingestion/sink/metadata_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,11 +586,9 @@ def write_profiler_response(self, record: ProfilerResponse) -> Either[Table]:
table=record.table, column_tags=record.column_tags
)
if not patched:
self.status.failed(
StackTraceError(
name=table.fullyQualifiedName.root,
error="Error patching tags for table",
)
self.status.warning(
key=table.fullyQualifiedName.root,
reason="Error patching tags for table",
)
else:
logger.debug(
Expand Down
14 changes: 10 additions & 4 deletions ingestion/src/metadata/profiler/source/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
OpenMetadata source for the profiler
"""
import traceback
from typing import Iterable, Optional, cast
from typing import Iterable, List, Optional, cast

from pydantic import BaseModel, ConfigDict

Expand Down Expand Up @@ -265,6 +265,14 @@ def get_database_entities(self):

return databases

def _get_fields(self) -> List[str]:
    """Return the entity fields needed to process the tables.

    When PII processing is enabled we also need the tag information,
    so the tags field is appended to the base table fields.
    """
    if self.source_config.processPiiSensitive:
        # PII detection requires the existing tags on top of the base fields
        return TABLE_FIELDS + TAGS_FIELD
    return TABLE_FIELDS

def get_table_entities(self, database):
"""
List and filter OpenMetadata tables based on the
Expand All @@ -282,9 +290,7 @@ def get_table_entities(self, database):
"""
tables = self.metadata.list_all_entities(
entity=Table,
fields=TABLE_FIELDS
if not self.source_config.processPiiSensitive
else TABLE_FIELDS + TAGS_FIELD,
fields=self._get_fields(),
params={
"service": self.config.source.serviceName,
"database": fqn.build(
Expand Down
36 changes: 1 addition & 35 deletions ingestion/src/metadata/profiler/source/metadata_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
from sqlalchemy.inspection import inspect

from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.services.ingestionPipelines.status import (
StackTraceError,
)
Expand Down Expand Up @@ -138,7 +137,7 @@ def _iter(self, *_, **__) -> Iterable[Either[ProfilerSourceAndEntity]]:
service_name=None,
schema_name=schema_name,
table_name=table_name,
fields="tableProfilerConfig",
fields=",".join(self._get_fields()),
)
if not table_entity:
logger.debug(
Expand Down Expand Up @@ -226,36 +225,3 @@ def get_database_names(self) -> Iterable[str]:
except Exception as exc:
logger.debug(f"Failed to fetch database names {exc}")
logger.debug(traceback.format_exc())

def get_table_entities(self, database):
    """
    List and filter OpenMetadata tables based on the
    source configuration.

    The listing will be based on the entities from the
    informed service name in the source configuration.

    Note that users can specify `table_filter_pattern` to
    either be `includes` or `excludes`. This means
    that we will either use what is specified in `includes`
    or we will use everything but the tables excluded.
    Same with `schema_filter_pattern`.
    """
    # List every table of the given database, requesting only the
    # tableProfilerConfig field since that is all this source reads.
    tables = self.metadata.list_all_entities(
        entity=Table,
        fields=[
            "tableProfilerConfig",
        ],
        params={
            "service": self.config.source.serviceName,
            # Scope the listing to this database via its fully qualified name
            "database": fqn.build(
                self.metadata,
                entity_type=Database,
                service_name=self.config.source.serviceName,
                database_name=database.name.root,
            ),
        },  # type: ignore
    )

    # Delegate include/exclude pattern filtering before yielding entities
    yield from self.filter_entities(tables)
4 changes: 2 additions & 2 deletions ingestion/src/metadata/workflow/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def _get_source_class(self):
if self.config.source.serviceName:
return OpenMetadataSource
logger.info(
"Database Service name not provided, we will scan all the tables"
"available within data source and locate table entity in OpenMetadata"
"Database Service name not provided, we will scan all the tables "
"available within data source and locate table entity in OpenMetadata "
"to ingest profiler data."
)
return OpenMetadataSourceExt
Expand Down
100 changes: 58 additions & 42 deletions ingestion/tests/unit/pii/test_ner_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,54 +11,70 @@
"""
Test Column Name Scanner
"""
from unittest import TestCase

from metadata.pii.scanners.ner_scanner import NERScanner
import pytest

from metadata.pii.scanners.ner_scanner import NERScanner, StringAnalysis

class NERScannerTest(TestCase):
"""
Validate various typical column names
"""

ner_scanner = NERScanner()
@pytest.fixture
def scanner() -> NERScanner:
    """Provide a NER scanner instance for the tests below."""
    ner_scanner = NERScanner()
    return ner_scanner

def test_scanner_none(self):
self.assertIsNone(self.ner_scanner.scan(list(range(100))))
self.assertIsNone(
self.ner_scanner.scan(
" ".split(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
)

def test_scanner_none(scanner):
assert scanner.scan(list(range(100))) is None
assert (
scanner.scan(
" ".split(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam consequat quam sagittis convallis cursus."
)
)
) is None

def test_scanner_sensitive(self):
self.assertEqual(
self.ner_scanner.scan(
[
"[email protected]",
"[email protected]",
"[email protected]",
]
).tag_fqn,
"PII.Sensitive",
)
self.assertEqual(
self.ner_scanner.scan(
["im ok", "[email protected]", "not sensitive"]
).tag_fqn,
"PII.Sensitive",
)

def test_scanner_nonsensitive(self):
self.assertEqual(
self.ner_scanner.scan(
[
"Washington",
"Alaska",
"Netherfield Lea Street",
]
).tag_fqn,
"PII.NonSensitive",
)
def test_scanner_sensitive(scanner):
assert (
scanner.scan(
[
"[email protected]",
"[email protected]",
"[email protected]",
]
).tag_fqn
== "PII.Sensitive"
)
assert (
scanner.scan(["im ok", "[email protected]", "not sensitive"]).tag_fqn
== "PII.Sensitive"
)


def test_scanner_nonsensitive(scanner):
assert (
scanner.scan(
[
"Washington",
"Alaska",
"Netherfield Lea Street",
]
).tag_fqn
== "PII.NonSensitive"
)


def test_get_highest_score_label(scanner):
    """Validate that even with score clashes, we only get one result back"""
    # Clear winner: the higher-scored label is returned with its score
    clear_winner = {
        "PII.Sensitive": StringAnalysis(score=0.9, appearances=1),
        "PII.NonSensitive": StringAnalysis(score=0.8, appearances=1),
    }
    assert scanner.get_highest_score_label(clear_winner) == ("PII.Sensitive", 0.9)

    # Exact tie: still a single deterministic result
    tie = {
        "PII.Sensitive": StringAnalysis(score=1.0, appearances=1),
        "PII.NonSensitive": StringAnalysis(score=1.0, appearances=1),
    }
    label, score = scanner.get_highest_score_label(tie)
    assert (label, score) == ("PII.Sensitive", 1.0)

0 comments on commit 75be6b6

Please sign in to comment.