Configuration File

The configuration file drives a lot of the reporting and anomaly detection in Qualipy. The default config.json file is created upon running qualipy generate-config through the CLI.

Some notes on the configuration:
  • how to reference metrics…

The following JSON shows all of the available keys that can be set.

{
    # The sqlalchemy like string that tells Qualipy where to store all data.
    # By default this is set to a sqlite file within the config directory
    "QUALIPY_DB": "sqlite:////tmp/.qualipy/qualipy.db",
    # If using a database like postgres that supports schemas, setting this
    # would place all data in that specific schema
    "SCHEMA": "",
    # The name of the project we're configuring. This corresponds to your Qualipy
    # pipeline.
    "example_project": {
        # This is where you specify anomaly specific settings
        "ANOMALY_ARGS": {
            # Each anomaly score corresponds to a standardized value. In general,
            # anything over 1 is considered an anomaly, but this could be used to
            # control the severity of the outliers
            "importance_level": 1.3,
            # This is used to set "rules" for any specific column. See what rules
            # are available to use **here (set this)
            "specific": {
                # to reference an aggregate, use run_name + column_name + metric_name + arguments (if any)
                "rows_my_column_count_": {
                    # "increasing" is just an example of a function that checks whether
                    # or not the aggregate is always increasing. This might be useful
                    # when you're inspecting the total size of a database
                    "increasing": {
                        # Can be turned on and off
                        "use": true,
                        # Since this is not a machine learning based approach, you have
                        # to set your own severity level when using custom rules
                        "severity": 3
                    }
                }
            }
        },
        # What anomaly model to use. See the Anomaly Detection guide for different
        # options
        "ANOMALY_MODEL": "prophet",
        # Date format to use on reports
        "DATE_FORMAT": "%Y-%m-%d",
        # Minimum severity level to set for filtering out numerical
        # anomalies on the anomaly report
        "NUM_SEVERITY_LEVEL": 1,
        # Minimum severity level to set for filtering out categorical
        # anomalies on the anomaly report
        "CAT_SEVERITY_LEVEL": 1,
        # Useful for categorizing anomalies based on certain thresholds
        "SEVERITY_LEVELS": {
            "low": 1.5,
            "medium": 2.5,
            "high": 10
        },
        # The following section controls the plots on the anomaly report
        "VISUALIZATION": {
            # Controls the visualizations that are displayed in the anomaly report. There
            # are 5 different categories of data to be displayed. Each one of them has their
            # own section

            # Since Qualipy by default gathers raw row counts for each data input, this section
            # will show the overall trend of data size over time
            "row_counts": {

                # Include this if you want to view the counts of the most recent batch.
                "include_bar_of_latest": {
                    "use": true,
                    "diff": true,
                    "show_by_default": true
                },
                # Include this if you want to get a summary overview of the row counts
                "include_summary": {
                    "use": true,
                    "show_by_default": true
                }
            },

            # This section is for viewing all metrics that return a numerical data type,
            # such as float and int
            "trend": {
                "include_bar_of_latest": {
                    "use": true,
                    # You can use this to only include certain metrics
                    "variables": [
                        "measurement_concept_id_measurement_number_of_unique_",
                        "drug_concept_id_drug_number_of_unique_"
                    ],
                    "diff": false,
                    "show_by_default": true
                },
                "include_summary": {
                    "use": true,
                    "show_by_default": true
                },
                # Specify an sst to add a layer to the plot that includes
                # change point detection. The value refers to how far to look back
                "sst": 3,
                # Set this to true if each batch should have a point. Note, this
                # can look unappealing with a large number of batches
                "point": true,
                # Set this to include a rolling mean for each trend
                "n_steps": 10,
                # Set this if you want to include a layer in the plot that shows
                # the difference from a previous value
                "add_diff": {
                    # Set this to determine how far to look back
                    "shift": 1
                }
            },
            # Add this to visualize all categorical variables (those returning dicts
            # with counts).
            "proportion": {
            },
            # This section includes analysis on the missingness of the data
            "missing": {
                # By default, it will only show data that contains any actual missing data.
                # To also show data without any missingness, set this to true
                "include_0": true,
                "include_bar_of_latest": {
                    "use": true,
                    "diff": false
                }
            }

        },

        # This section is for customizing the metric names and hover-over descriptions,
        # in order to potentially make them more human-readable
        "DISPLAY_NAMES": {
            # This default list is automatically populated by the function name
            # and description from the function definition
            "DEFAULT": {
                "number_of_unique": {
                    "display_name": "number_of_unique_values",
                    "description": "A total count of the number of unique values in the batch"
                }
            },
            "CUSTOM": {
                "random_function": {
                    "display_name": "Random Function",
                    "description": "Description of random_function"
                }
            }
        }
    }
}