diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index d5e4ec7..4c6c7cb 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -1,3 +1,5 @@ +name: PyDeequ V2 Tests + on: push: branches: @@ -7,33 +9,65 @@ on: - "master" jobs: - test: + # V2 tests with Spark Connect (Python 3.12) + v2-tests: runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - PYSPARK_VERSION: ["3.1.3", "3.2", "3.3", "3.5"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-python@v2 - name: Install Python 3.8 + - uses: actions/setup-python@v5 + name: Install Python 3.12 with: - python-version: 3.8 + python-version: "3.12" - - uses: actions/setup-java@v1 - name: Setup Java 11 - if: startsWith(matrix.PYSPARK_VERSION, '3') + - uses: actions/setup-java@v4 + name: Setup Java 17 with: - java-version: "11" + distribution: "corretto" + java-version: "17" - - name: Running tests with pyspark==${{matrix.PYSPARK_VERSION}} - env: - SPARK_VERSION: ${{matrix.PYSPARK_VERSION}} + - name: Download Spark 3.5 + run: | + curl -L -o spark-3.5.0-bin-hadoop3.tgz \ + https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz + tar -xzf spark-3.5.0-bin-hadoop3.tgz + echo "SPARK_HOME=$PWD/spark-3.5.0-bin-hadoop3" >> $GITHUB_ENV + + - name: Download Deequ JAR run: | - pip install --upgrade pip + curl -L -o deequ_2.12-2.1.0b-spark-3.5.jar \ + https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/deequ_2.12-2.1.0b-spark-3.5.jar + + - name: Install Python dependencies + run: | + pip install --upgrade pip setuptools pip install poetry==1.7.1 poetry install - poetry add pyspark==$SPARK_VERSION - poetry run python -m pytest -s tests + poetry add "pyspark[connect]==3.5.0" + + - name: Run V2 unit tests + run: | + poetry run pytest tests/v2/test_unit.py -v + + - name: Start Spark Connect Server + run: | + $SPARK_HOME/sbin/start-connect-server.sh \ + --packages org.apache.spark:spark-connect_2.12:3.5.0 \ + --jars $PWD/deequ_2.12-2.1.0b-spark-3.5.jar \ + --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin + # Wait for server to start + sleep 20 + # Verify server is running + ps aux | grep SparkConnectServer | grep -v grep + + - name: Run V2 integration tests + env: + SPARK_REMOTE: "sc://localhost:15002" + run: | + poetry run pytest tests/v2/ -v --ignore=tests/v2/test_unit.py + + - name: Stop Spark Connect Server + if: always() + run: | + $SPARK_HOME/sbin/stop-connect-server.sh || true diff --git a/README.md b/README.md index a6003c9..2d19db5 100644 --- a/README.md +++ b/README.md @@ -1,103 +1,489 @@ # PyDeequ -PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets. PyDeequ is written to support usage of Deequ in Python. +PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library built on top of Apache Spark for defining "unit tests for data", which measure data quality in large datasets. [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) ![Coverage](https://img.shields.io/badge/coverage-90%25-green) -There are 4 main components of Deequ, and they are: -- Metrics Computation: - - `Profiles` leverages Analyzers to analyze each column of a dataset. - - `Analyzers` serve here as a foundational module that computes metrics for data profiling and validation at scale. 
-- Constraint Suggestion: - - Specify rules for various groups of Analyzers to be run over a dataset to return back a collection of constraints suggested to run in a Verification Suite. -- Constraint Verification: - - Perform data validation on a dataset with respect to various constraints set by you. -- Metrics Repository - - Allows for persistence and tracking of Deequ runs over time. +## What's New in PyDeequ 2.0 + +PyDeequ 2.0 introduces a new architecture using **Spark Connect**, bringing significant improvements: + +| Feature | PyDeequ 1.x | PyDeequ 2.0 | +|---------|-------------|-------------| +| Communication | Py4J (JVM bridge) | Spark Connect (gRPC) | +| Assertions | Python lambdas | Serializable predicates | +| Spark Session | Local only | Local or Remote | +| Architecture | Tight JVM coupling | Clean client-server | + +**Key Benefits:** +- **No Py4J dependency** - Uses Spark Connect protocol for communication +- **Serializable predicates** - Replace Python lambdas with predicate objects (`eq`, `gte`, `between`, etc.) +- **Remote execution** - Connect to remote Spark clusters via Spark Connect +- **Cleaner API** - Simplified imports and more Pythonic interface + +### Architecture + +```mermaid +flowchart LR + subgraph CLIENT["Python Client"] + A["Python Code"] --> B["Protobuf
Serialization"] + end + B -- gRPC --> C["Spark Connect (gRPC)"] + subgraph SERVER["Spark Connect Server"] + D["DeequRelationPlugin"] --> E["Deequ Core"] --> F["Spark DataFrame API"] --> G["(Data)"] + end + G --> H["Results"] -- gRPC --> I["Python DataFrame"] + %% Styling for compactness and distinction + classDef code fill:#C8F2FB,stroke:#35a7c2,color:#13505B,font-weight:bold; + class A code; +``` -![](imgs/pydeequ_architecture.jpg) +**How it works:** +1. **Client Side**: PyDeequ 2.0 builds checks and analyzers as Protobuf messages +2. **Transport**: Messages are sent via gRPC to the Spark Connect server +3. **Server Side**: The `DeequRelationPlugin` deserializes messages and executes Deequ operations +4. **Results**: Verification results are returned as a Spark DataFrame + +### Feature Support Matrix + +| Feature | PyDeequ 1.x | PyDeequ 2.0 | +|---------|:-----------:|:-----------:| +| **Constraint Verification** | | | +| VerificationSuite | Yes | Yes | +| Check constraints | Yes | Yes | +| Custom SQL expressions | Yes | Yes | +| **Metrics & Analysis** | | | +| AnalysisRunner | Yes | Yes | +| All standard analyzers | Yes | Yes | +| **Column Profiling** | | | +| ColumnProfilerRunner | Yes | Yes | +| Numeric statistics | Yes | Yes | +| KLL sketch profiling | Yes | Yes | +| Low-cardinality histograms | Yes | Yes | +| **Constraint Suggestions** | | | +| ConstraintSuggestionRunner | Yes | Yes | +| Rule sets (DEFAULT, EXTENDED, etc.) | Yes | Yes | +| Train/test split evaluation | Yes | Yes | +| **Metrics Repository** | | | +| FileSystemMetricsRepository | Yes | Planned | +| **Execution Mode** | | | +| Local Spark | Yes | No | +| Spark Connect (remote) | No | Yes | + +--- + +## PyDeequ 2.0 Beta - Quick Start + +### Requirements + +- Python 3.9+ +- Apache Spark 3.5.0+ +- Java 17 (Java 21+ has known compatibility issues with Spark 3.5) + +### Step 1: Download Deequ Pre-release JAR + +Download the pre-compiled Deequ JAR with Spark Connect support from the [GitHub pre-releases](https://github.com/awslabs/python-deequ/releases): -## 🎉 Announcements 🎉 -- **NEW!!!** The 1.4.0 release of Python Deequ has been published to PYPI https://pypi.org/project/pydeequ/. This release adds support for Spark 3.5.0. -- The latest version of Deequ, 2.0.7, is made available With Python Deequ 1.3.0. -- 1.1.0 release of Python Deequ has been published to PYPI https://pypi.org/project/pydeequ/. This release brings many recent upgrades including support up to Spark 3.3.0! Any feedbacks are welcome through github issues. -- With PyDeequ v0.1.8+, we now officially support Spark3 ! Just make sure you have an environment variable `SPARK_VERSION` to specify your Spark version! -- We've release a blogpost on integrating PyDeequ onto AWS leveraging services such as AWS Glue, Athena, and SageMaker! Check it out: [Monitor data quality in your data lake using PyDeequ and AWS Glue](https://aws.amazon.com/blogs/big-data/monitor-data-quality-in-your-data-lake-using-pydeequ-and-aws-glue/). -- Check out the [PyDeequ Release Announcement Blogpost](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/) with a tutorial walkthrough the Amazon Reviews dataset! -- Join the PyDeequ community on [PyDeequ Slack](https://join.slack.com/t/pydeequ/shared_invite/zt-te6bntpu-yaqPy7bhiN8Lu0NxpZs47Q) to chat with the devs! 
+```bash +mkdir -p ~/deequ-beta && cd ~/deequ-beta -## Quickstart +curl -L -o deequ_2.12-2.1.0b-spark-3.5.jar \ + https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/deequ_2.12-2.1.0b-spark-3.5.jar +``` -The following will quickstart you with some basic usage. For more in-depth examples, take a look in the [`tutorials/`](tutorials/) directory for executable Jupyter notebooks of each module. For documentation on supported interfaces, view the [`documentation`](https://pydeequ.readthedocs.io/). +### Step 2: Set Up Spark (if needed) -### Installation +Optional, should only be needed for quick local testing. +```bash +# Download Spark 3.5 +curl -L -o spark-3.5.0-bin-hadoop3.tgz \ + https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz -You can install [PyDeequ via pip](https://pypi.org/project/pydeequ/). +tar -xzf spark-3.5.0-bin-hadoop3.tgz +export SPARK_HOME=~/deequ-beta/spark-3.5.0-bin-hadoop3 +export PATH=$SPARK_HOME/bin:$PATH ``` -pip install pydeequ + +### Step 3: Start Spark Connect Server + +Spark Connect is a client-server architecture introduced in Spark 3.4 that allows remote connectivity to Spark clusters. For more details, see the [Spark Connect Overview](https://spark.apache.org/docs/latest/spark-connect-overview.html). + +```bash +export JAVA_HOME=/path/to/java17 + +$SPARK_HOME/sbin/start-connect-server.sh \ + --packages org.apache.spark:spark-connect_2.12:3.5.0 \ + --jars ~/deequ-beta/deequ_2.12-2.1.0b-spark-3.5.jar \ + --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin +``` + +**Command explanation:** +| Option | Description | +|--------|-------------| +| `--packages` | Downloads the Spark Connect package from Maven | +| `--jars` | Loads the Deequ JAR with Spark Connect support | +| `--conf spark.connect.extensions.relation.classes` | Registers the Deequ plugin to handle custom operations | + +The server starts on `localhost:15002` by default. 
You can verify it's running: +```bash +ps aux | grep SparkConnectServer +``` + +### Step 4: Install PyDeequ 2.0 + +Install the beta wheel directly from the GitHub release: + +```bash +pip install https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/pydeequ-2.0.0b1-py3-none-any.whl +pip install pyspark[connect]==3.5.0 + +# Python 3.12+ users: install setuptools (provides distutils removed in 3.12) +pip install setuptools ``` -### Set up a PySpark session +### Step 5: Run Your First Check + ```python from pyspark.sql import SparkSession, Row -import pydeequ +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.verification import VerificationSuite +from pydeequ.v2.predicates import eq, gte + +# Connect to Spark Connect server +spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + +# Create sample data +df = spark.createDataFrame([ + Row(id=1, name="Alice", age=25), + Row(id=2, name="Bob", age=30), + Row(id=3, name="Charlie", age=None), +]) + +# Define checks using the new predicate API +check = (Check(CheckLevel.Error, "Data quality checks") + .hasSize(eq(3)) + .isComplete("id") + .isComplete("name") + .hasCompleteness("age", gte(0.5)) + .isUnique("id")) + +# Run verification +result = (VerificationSuite(spark) + .onData(df) + .addCheck(check) + .run()) + +result.show(truncate=False) +spark.stop() +``` -spark = (SparkSession - .builder - .config("spark.jars.packages", pydeequ.deequ_maven_coord) - .config("spark.jars.excludes", pydeequ.f2j_maven_coord) - .getOrCreate()) +### Stop the Server -df = spark.sparkContext.parallelize([ - Row(a="foo", b=1, c=5), - Row(a="bar", b=2, c=6), - Row(a="baz", b=3, c=None)]).toDF() +```bash +$SPARK_HOME/sbin/stop-connect-server.sh ``` +### Full Example + +For a comprehensive example covering data analysis, constraint verification, column profiling, and constraint suggestions, see [tutorials/data_quality_example_v2.py](tutorials/data_quality_example_v2.py). 
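+
+Because verification results come back as a plain Spark DataFrame, they compose directly with the regular DataFrame API. The sketch below is illustrative only: it continues the Step 5 example (run it before `spark.stop()`) and assumes the result schema exposes a `constraint_status` column with `Success`/`Failure` values, as in PyDeequ 1.x; exact column names may differ in the 2.0 beta.
+
+```python
+# Keep only the constraints that did not pass (assumes a `constraint_status` column).
+failed = result.filter(result.constraint_status != "Success")
+failed.show(truncate=False)
+
+# Optionally fail the surrounding job when any constraint is violated.
+if failed.count() > 0:
+    raise ValueError("Data quality checks failed")
+```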
+ +--- + +## PyDeequ 2.0 API Reference + +### Predicates (replace lambdas) + +```python +from pydeequ.v2.predicates import eq, gt, gte, lt, lte, between + +check.hasSize(eq(3)) # size == 3 +check.hasCompleteness("col", gte(0.9)) # completeness >= 0.9 +check.hasMean("value", between(10, 20)) # 10 <= mean <= 20 +``` + +| Predicate | Description | Example | +|-----------|-------------|---------| +| `eq(v)` | Equal to v | `eq(1.0)` | +| `gt(v)` | Greater than v | `gt(0)` | +| `gte(v)` | Greater than or equal | `gte(0.9)` | +| `lt(v)` | Less than v | `lt(100)` | +| `lte(v)` | Less than or equal | `lte(1.0)` | +| `between(a, b)` | Between a and b (inclusive) | `between(0, 1)` | + ### Analyzers ```python -from pydeequ.analyzers import * +from pydeequ.v2.verification import AnalysisRunner +from pydeequ.v2.analyzers import ( + Size, Completeness, Mean, Sum, Minimum, Maximum, + StandardDeviation, ApproxCountDistinct, Distinctness, + Uniqueness, Entropy, Correlation +) + +result = (AnalysisRunner(spark) + .onData(df) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("name")) + .addAnalyzer(Mean("age")) + .run()) + +result.show() +``` -analysisResult = AnalysisRunner(spark) \ - .onData(df) \ - .addAnalyzer(Size()) \ - .addAnalyzer(Completeness("b")) \ - .run() +### Constraint Methods + +| Method | Description | +|--------|-------------| +| `hasSize(predicate)` | Check total row count | +| `isComplete(column)` | Check column has no nulls | +| `hasCompleteness(column, predicate)` | Check completeness ratio | +| `areComplete(columns)` | Check multiple columns have no nulls | +| `isUnique(column)` | Check column values are unique | +| `hasUniqueness(columns, predicate)` | Check uniqueness ratio | +| `hasDistinctness(columns, predicate)` | Check distinctness ratio | +| `hasMin(column, predicate)` | Check minimum value | +| `hasMax(column, predicate)` | Check maximum value | +| `hasMean(column, predicate)` | Check mean value | +| `hasSum(column, predicate)` | Check sum | +| `hasStandardDeviation(column, predicate)` | Check standard deviation | +| `hasApproxCountDistinct(column, predicate)` | Check approximate distinct count | +| `hasCorrelation(col1, col2, predicate)` | Check correlation between columns | +| `hasEntropy(column, predicate)` | Check entropy | +| `hasApproxQuantile(column, quantile, predicate)` | Check approximate quantile | +| `satisfies(expression, name, predicate)` | Custom SQL expression | +| `hasPattern(column, pattern, predicate)` | Check regex pattern match ratio | +| `containsEmail(column, predicate)` | Check email format ratio | +| `containsCreditCardNumber(column, predicate)` | Check credit card format ratio | +| `isNonNegative(column)` | Check all values >= 0 | +| `isPositive(column)` | Check all values > 0 | + +### Column Profiler + +Profile column distributions and statistics across your dataset: -analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult) -analysisResult_df.show() +```python +from pydeequ.v2.profiles import ColumnProfilerRunner, KLLParameters + +# Basic profiling +profiles = (ColumnProfilerRunner(spark) + .onData(df) + .run()) + +profiles.show() + +# Advanced profiling with options +profiles = (ColumnProfilerRunner(spark) + .onData(df) + .restrictToColumns(["id", "name", "age"]) # Profile specific columns + .withLowCardinalityHistogramThreshold(100) # Generate histograms for low-cardinality columns + .withKLLProfiling() # Enable KLL sketch for approximate quantiles + .setKLLParameters(KLLParameters( + sketch_size=2048, + 
shrinking_factor=0.64, + num_buckets=64 + )) + .run()) ``` -### Profile +**Profile Result Schema:** + +| Column | Type | Description | +|--------|------|-------------| +| `column` | STRING | Column name | +| `completeness` | DOUBLE | Non-null ratio (0.0-1.0) | +| `approx_distinct_values` | LONG | Approximate cardinality | +| `data_type` | STRING | Detected data type | +| `is_data_type_inferred` | BOOLEAN | Whether type was inferred | +| `type_counts` | STRING | JSON of type distribution | +| `histogram` | STRING | JSON histogram (low cardinality only) | +| `mean` | DOUBLE | Mean (numeric columns only) | +| `minimum` | DOUBLE | Minimum value (numeric only) | +| `maximum` | DOUBLE | Maximum value (numeric only) | +| `sum` | DOUBLE | Sum (numeric only) | +| `std_dev` | DOUBLE | Standard deviation (numeric only) | +| `approx_percentiles` | STRING | JSON percentiles (numeric only) | +| `kll_buckets` | STRING | JSON KLL buckets (if enabled) | + +### Constraint Suggestions + +Auto-generate data quality constraints based on your data: ```python -from pydeequ.profiles import * +from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + +# Basic suggestion generation +suggestions = (ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .run()) + +suggestions.show(truncate=False) + +# Advanced usage with train/test evaluation +suggestions = (ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .addConstraintRules(Rules.EXTENDED) + .restrictToColumns(["id", "status", "score"]) + .useTrainTestSplitWithTestsetRatio(0.2, seed=42) # Evaluate suggestions on test set + .run()) +``` -result = ColumnProfilerRunner(spark) \ - .onData(df) \ - .run() +**Available Rule Sets:** + +| Rule Set | Description | +|----------|-------------| +| `Rules.DEFAULT` | Completeness, type, categorical range, non-negative | +| `Rules.STRING` | String length constraints (min/max length) | +| `Rules.NUMERICAL` | Numeric constraints (min, max, mean, stddev) | +| `Rules.COMMON` | Uniqueness for approximately unique columns | +| `Rules.EXTENDED` | All rules combined | + +**Suggestion Result Schema:** + +| Column | Type | Description | +|--------|------|-------------| +| `column_name` | STRING | Column the constraint applies to | +| `constraint_name` | STRING | Type of constraint | +| `current_value` | STRING | Current metric value | +| `description` | STRING | Human-readable description | +| `suggesting_rule` | STRING | Rule that generated this | +| `code_for_constraint` | STRING | Python code snippet | +| `evaluation_status` | STRING | "Success" or "Failure" (if train/test enabled) | +| `evaluation_metric_value` | DOUBLE | Metric value on test set | + +### Migration from 1.x to 2.0 + +**Import changes:** +```python +# Before (1.x) +from pydeequ.checks import Check, CheckLevel +from pydeequ.verification import VerificationSuite + +# After (2.0) +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.verification import VerificationSuite +from pydeequ.v2.predicates import eq, gte, between +``` + +**Lambda to predicate:** +```python +# Before (1.x) +check.hasSize(lambda x: x == 3) +check.hasCompleteness("col", lambda x: x >= 0.9) +# After (2.0) +check.hasSize(eq(3)) +check.hasCompleteness("col", gte(0.9)) +``` + +**Profiler changes:** +```python +# Before (1.x) - returns Python object +from pydeequ.profiles import ColumnProfilerRunner +result = ColumnProfilerRunner(spark).onData(df).run() for col, profile in result.profiles.items(): print(profile) + 
+# After (2.0) - returns DataFrame +from pydeequ.v2.profiles import ColumnProfilerRunner +result = ColumnProfilerRunner(spark).onData(df).run() +result.show() ``` -### Constraint Suggestions +**Suggestions changes:** +```python +# Before (1.x) - returns Python object +from pydeequ.suggestions import ConstraintSuggestionRunner, DEFAULT +result = ConstraintSuggestionRunner(spark).onData(df).addConstraintRule(DEFAULT()).run() +print(result) + +# After (2.0) - returns DataFrame +from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules +result = ConstraintSuggestionRunner(spark).onData(df).addConstraintRules(Rules.DEFAULT).run() +result.show() +``` + +--- + +## PyDeequ 2.0 Troubleshooting + +### Server won't start +1. Check Java version: `java -version` (must be Java 17, not 21+) +2. Check port availability: `lsof -i :15002` +3. Check logs: `tail -f $SPARK_HOME/logs/spark-*-SparkConnectServer-*.out` + +### Connection refused +Ensure the Spark Connect server is running: +```bash +ps aux | grep SparkConnectServer +``` + +### ClassNotFoundException: DeequRelationPlugin +Ensure the Deequ JAR is correctly specified in `--jars` when starting the server. + +### UnsupportedOperationException: sun.misc.Unsafe not available +This error occurs when using Java 21+ with Spark 3.5. Use Java 17 instead: +```bash +export JAVA_HOME=/path/to/java17 +``` + +### ModuleNotFoundError: No module named 'distutils' +This occurs on Python 3.12+ because `distutils` was removed. Install setuptools: +```bash +pip install setuptools +``` + +--- + +## PyDeequ 1.x (Legacy) + +The legacy PyDeequ API uses Py4J for JVM communication. It is still available for backward compatibility. + +### Installation + +```bash +pip install pydeequ +``` + +**Note:** Set the `SPARK_VERSION` environment variable to match your Spark version. 
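+
+For example, with Spark 3.5 installed (one of the versions exercised by the legacy CI matrix), export the variable before launching Python so PyDeequ 1.x can resolve the matching Deequ Maven coordinates:
+
+```bash
+# Must match the Spark version on your cluster or local machine.
+export SPARK_VERSION=3.5
+```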
+ +### Quick Start (1.x) ```python -from pydeequ.suggestions import * +from pyspark.sql import SparkSession, Row +import pydeequ -suggestionResult = ConstraintSuggestionRunner(spark) \ - .onData(df) \ - .addConstraintRule(DEFAULT()) \ - .run() +spark = (SparkSession + .builder + .config("spark.jars.packages", pydeequ.deequ_maven_coord) + .config("spark.jars.excludes", pydeequ.f2j_maven_coord) + .getOrCreate()) -# Constraint Suggestions in JSON format -print(suggestionResult) +df = spark.sparkContext.parallelize([ + Row(a="foo", b=1, c=5), + Row(a="bar", b=2, c=6), + Row(a="baz", b=3, c=None) +]).toDF() +``` + +### Analyzers (1.x) + +```python +from pydeequ.analyzers import * + +analysisResult = AnalysisRunner(spark) \ + .onData(df) \ + .addAnalyzer(Size()) \ + .addAnalyzer(Completeness("b")) \ + .run() + +analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult) +analysisResult_df.show() ``` -### Constraint Verification +### Constraint Verification (1.x) ```python from pydeequ.checks import * @@ -110,8 +496,8 @@ checkResult = VerificationSuite(spark) \ .addCheck( check.hasSize(lambda x: x >= 3) \ .hasMin("b", lambda x: x == 0) \ - .isComplete("c") \ - .isUnique("a") \ + .isComplete("c") \ + .isUnique("a") \ .isContainedIn("a", ["foo", "bar", "baz"]) \ .isNonNegative("b")) \ .run() @@ -120,9 +506,34 @@ checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult) checkResult_df.show() ``` -### Repository +### Profile (1.x) + +```python +from pydeequ.profiles import * + +result = ColumnProfilerRunner(spark) \ + .onData(df) \ + .run() + +for col, profile in result.profiles.items(): + print(profile) +``` + +### Constraint Suggestions (1.x) + +```python +from pydeequ.suggestions import * + +suggestionResult = ConstraintSuggestionRunner(spark) \ + .onData(df) \ + .addConstraintRule(DEFAULT()) \ + .run() + +print(suggestionResult) +``` + +### Repository (1.x) -Save to a Metrics Repository by adding the `useRepository()` and `saveOrAppendResult()` calls to your Analysis Runner. ```python from pydeequ.repository import * from pydeequ.analyzers import * @@ -140,120 +551,107 @@ analysisResult = AnalysisRunner(spark) \ .run() ``` -To load previous runs, use the `repository` object to load previous results back in. - -```python -result_metrep_df = repository.load() \ - .before(ResultKey.current_milli_time()) \ - .forAnalyzers([ApproxCountDistinct('b')]) \ - .getSuccessMetricsAsDataFrame() -``` - -### Wrapping up - -After you've ran your jobs with PyDeequ, be sure to shut down your Spark session to prevent any hanging processes. +### Wrapping Up (1.x) ```python spark.sparkContext._gateway.shutdown_callback_server() spark.stop() ``` -## [Contributing](https://github.com/awslabs/python-deequ/blob/master/CONTRIBUTING.md) -Please refer to the [contributing doc](https://github.com/awslabs/python-deequ/blob/master/CONTRIBUTING.md) for how to contribute to PyDeequ. +--- -## [License](https://github.com/awslabs/python-deequ/blob/master/LICENSE) +## Deequ Components -This library is licensed under the Apache 2.0 License. +There are 4 main components of Deequ: -****** +- **Metrics Computation** + - `Profiles` leverages Analyzers to analyze each column of a dataset. + - `Analyzers` compute metrics for data profiling and validation at scale. +- **Constraint Suggestion** + - Specify rules for Analyzers to return suggested constraints. +- **Constraint Verification** + - Validate data against constraints you define. 
+- **Metrics Repository** + - Persist and track Deequ runs over time. -## Contributing Developer Setup +![](imgs/pydeequ_architecture.jpg) -1. Setup [SDKMAN](#setup-sdkman) -1. Setup [Java](#setup-java) -1. Setup [Apache Spark](#setup-apache-spark) -1. Install [Poetry](#poetry) -1. Run [tests locally](#running-tests-locally) +--- -### Setup SDKMAN +## Feedback and Issues -SDKMAN is a tool for managing parallel Versions of multiple Software Development Kits on any Unix based -system. It provides a convenient command line interface for installing, switching, removing and listing -Candidates. SDKMAN! installs smoothly on Mac OSX, Linux, WSL, Cygwin, etc... Support Bash and ZSH shells. See -documentation on the [SDKMAN! website](https://sdkman.io). +Please report any issues or feedback to: +- GitHub Issues: https://github.com/awslabs/deequ/issues +- Tag PyDeequ 2.0 issues with `pydeequ-2.0` -Open your favourite terminal and enter the following: +When reporting issues, include: +1. Python version +2. Spark version +3. Java version +4. Operating system +5. Full error message and stack trace +6. Minimal code to reproduce -```bash -$ curl -s https://get.sdkman.io | bash -If the environment needs tweaking for SDKMAN to be installed, -the installer will prompt you accordingly and ask you to restart. +--- -Next, open a new terminal or enter: +## Contributing -$ source "$HOME/.sdkman/bin/sdkman-init.sh" +Please refer to the [contributing doc](https://github.com/awslabs/python-deequ/blob/master/CONTRIBUTING.md) for how to contribute to PyDeequ. -Lastly, run the following code snippet to ensure that installation succeeded: +## License -$ sdk version -``` +This library is licensed under the Apache 2.0 License. -### Setup Java +--- -Install Java Now open favourite terminal and enter the following: +## Developer Setup + +1. Setup [SDKMAN](#setup-sdkman) +2. Setup [Java](#setup-java) +3. Setup [Apache Spark](#setup-apache-spark) +4. Install [Poetry](#poetry) +5. Run [tests locally](#running-tests-locally) + +### Setup SDKMAN ```bash -List the AdoptOpenJDK OpenJDK versions -$ sdk list java +curl -s https://get.sdkman.io | bash +source "$HOME/.sdkman/bin/sdkman-init.sh" +sdk version +``` -To install For Java 11 -$ sdk install java 11.0.10.hs-adpt +### Setup Java -To install For Java 11 -$ sdk install java 8.0.292.hs-adpt +```bash +sdk list java +sdk install java 17.0.9-amzn # For PyDeequ 2.0 +sdk install java 11.0.10.hs-adpt # For PyDeequ 1.x ``` ### Setup Apache Spark -Install Java Now open favourite terminal and enter the following: - ```bash -List the Apache Spark versions: -$ sdk list spark - -To install For Spark 3 -$ sdk install spark 3.0.2 +sdk list spark +sdk install spark 3.5.0 ``` ### Poetry -Poetry [Commands](https://python-poetry.org/docs/cli/#search) - ```bash poetry install - poetry update - -# --tree: List the dependencies as a tree. -# --latest (-l): Show the latest version. -# --outdated (-o): Show the latest version but only for packages that are outdated. poetry show -o ``` -## Running Tests Locally - -Take a look at tests in `tests/dataquality` and `tests/jobs` +### Running Tests Locally ```bash -$ poetry run pytest +poetry run pytest ``` -## Running Tests Locally (Docker) - -If you have issues installing the dependencies listed above, another way to run the tests and verify your changes is through Docker. There is a Dockerfile that will install the required dependencies and run the tests in a container. +### Running Tests (Docker) +```bash +docker build . 
-t spark-3.5-docker-test +docker run spark-3.5-docker-test ``` -docker build . -t spark-3.3-docker-test -docker run spark-3.3-docker-test -``` - diff --git a/poetry.lock b/poetry.lock index 164c6c3..5b439ef 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,201 +1,71 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. - -[[package]] -name = "atomicwrites" -version = "1.4.1" -description = "Atomic file writes." -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, -] - -[[package]] -name = "attrs" -version = "22.1.0" -description = "Classes Without Boilerplate" -optional = false -python-versions = ">=3.5" -files = [ - {file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"}, - {file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"}, -] - -[package.extras] -dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy (>=0.900,!=0.940)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "sphinx", "sphinx-notfound-page", "zope.interface"] -docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] -tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"] -tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"] +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "black" -version = "21.12b0" +version = "24.10.0" description = "The uncompromising code formatter." 
optional = false -python-versions = ">=3.6.2" -files = [ - {file = "black-21.12b0-py3-none-any.whl", hash = "sha256:a615e69ae185e08fdd73e4715e260e2479c861b5740057fde6e8b4e3b7dd589f"}, - {file = "black-21.12b0.tar.gz", hash = "sha256:77b80f693a569e2e527958459634f18df9b0ba2625ba4e0c2d5da5be42e6f2b3"}, +python-versions = ">=3.9" +files = [ + {file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"}, + {file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"}, + {file = "black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:649fff99a20bd06c6f727d2a27f401331dc0cc861fb69cde910fe95b01b5928f"}, + {file = "black-24.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe4d6476887de70546212c99ac9bd803d90b42fc4767f058a0baa895013fbb3e"}, + {file = "black-24.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5a2221696a8224e335c28816a9d331a6c2ae15a2ee34ec857dcf3e45dbfa99ad"}, + {file = "black-24.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9da3333530dbcecc1be13e69c250ed8dfa67f43c4005fb537bb426e19200d50"}, + {file = "black-24.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4007b1393d902b48b36958a216c20c4482f601569d19ed1df294a496eb366392"}, + {file = "black-24.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:394d4ddc64782e51153eadcaaca95144ac4c35e27ef9b0a42e121ae7e57a9175"}, + {file = "black-24.10.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e39e0fae001df40f95bd8cc36b9165c5e2ea88900167bddf258bacef9bbdc3"}, + {file = "black-24.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d37d422772111794b26757c5b55a3eade028aa3fde43121ab7b673d050949d65"}, + {file = "black-24.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14b3502784f09ce2443830e3133dacf2c0110d45191ed470ecb04d0f5f6fcb0f"}, + {file = "black-24.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:30d2c30dc5139211dda799758559d1b049f7f14c580c409d6ad925b74a4208a8"}, + {file = "black-24.10.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1cbacacb19e922a1d75ef2b6ccaefcd6e93a2c05ede32f06a21386a04cedb981"}, + {file = "black-24.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1f93102e0c5bb3907451063e08b9876dbeac810e7da5a8bfb7aeb5a9ef89066b"}, + {file = "black-24.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ddacb691cdcdf77b96f549cf9591701d8db36b2f19519373d60d31746068dbf2"}, + {file = "black-24.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:680359d932801c76d2e9c9068d05c6b107f2584b2a5b88831c83962eb9984c1b"}, + {file = "black-24.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:17374989640fbca88b6a448129cd1745c5eb8d9547b464f281b251dd00155ccd"}, + {file = "black-24.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:63f626344343083322233f175aaf372d326de8436f5928c042639a4afbbf1d3f"}, + {file = "black-24.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfa1d0cb6200857f1923b602f978386a3a2758a65b52e0950299ea014be6800"}, + {file = "black-24.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cd9c95431d94adc56600710f8813ee27eea544dd118d45896bb734e9d7a0dc7"}, + {file = "black-24.10.0-py3-none-any.whl", hash = "sha256:3bb2b7a1f7b685f85b11fed1ef10f8a9148bceb49853e47a294a3dd963c1dd7d"}, + {file = 
"black-24.10.0.tar.gz", hash = "sha256:846ea64c97afe3bc677b761787993be4991810ecc7a4a937816dd6bddedc4875"}, ] [package.dependencies] -click = ">=7.1.2" +click = ">=8.0.0" mypy-extensions = ">=0.4.3" -pathspec = ">=0.9.0,<1" +packaging = ">=22.0" +pathspec = ">=0.9.0" platformdirs = ">=2" -tomli = ">=0.2.6,<2.0.0" -typing-extensions = [ - {version = ">=3.10.0.0,<3.10.0.1 || >3.10.0.1", markers = "python_version >= \"3.10\""}, - {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}, -] +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} [package.extras] colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)"] +d = ["aiohttp (>=3.10)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -python2 = ["typed-ast (>=1.4.3)"] uvloop = ["uvloop (>=0.15.2)"] -[[package]] -name = "bleach" -version = "5.0.1" -description = "An easy safelist-based HTML-sanitizing tool." -optional = false -python-versions = ">=3.7" -files = [ - {file = "bleach-5.0.1-py3-none-any.whl", hash = "sha256:085f7f33c15bd408dd9b17a4ad77c577db66d76203e5984b1bd59baeee948b2a"}, - {file = "bleach-5.0.1.tar.gz", hash = "sha256:0d03255c47eb9bd2f26aa9bb7f2107732e7e8fe195ca2f64709fcf3b0a4a085c"}, -] - -[package.dependencies] -six = ">=1.9.0" -webencodings = "*" - -[package.extras] -css = ["tinycss2 (>=1.1.0,<1.2)"] -dev = ["Sphinx (==4.3.2)", "black (==22.3.0)", "build (==0.8.0)", "flake8 (==4.0.1)", "hashin (==0.17.0)", "mypy (==0.961)", "pip-tools (==6.6.2)", "pytest (==7.1.2)", "tox (==3.25.0)", "twine (==4.0.1)", "wheel (==0.37.1)"] - -[[package]] -name = "certifi" -version = "2024.7.4" -description = "Python package for providing Mozilla's CA Bundle." -optional = false -python-versions = ">=3.6" -files = [ - {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, - {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, -] - -[[package]] -name = "cffi" -version = "1.15.1" -description = "Foreign Function Interface for Python calling C code." 
-optional = false -python-versions = "*" -files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - 
{file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - 
{file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, -] - -[package.dependencies] -pycparser = "*" - [[package]] name = "cfgv" -version = "3.3.1" +version = "3.4.0" description = "Validate configuration and produce human readable error messages." optional = false -python-versions = ">=3.6.1" -files = [ - {file = "cfgv-3.3.1-py2.py3-none-any.whl", hash = "sha256:c6a0883f3917a037485059700b9e75da2464e6c27051014ad85ba6aaa5884426"}, - {file = "cfgv-3.3.1.tar.gz", hash = "sha256:f5a830efb9ce7a445376bb66ec94c638a9787422f96264c98edc6bdeed8ab736"}, -] - -[[package]] -name = "charset-normalizer" -version = "2.1.1" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
-optional = false -python-versions = ">=3.6.0" +python-versions = ">=3.8" files = [ - {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, - {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, ] -[package.extras] -unicode-backport = ["unicodedata2"] - [[package]] name = "click" -version = "8.1.3" +version = "8.1.8" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, - {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, ] [package.dependencies] @@ -203,535 +73,561 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "colorama" -version = "0.4.5" +version = "0.4.6" description = "Cross-platform colored terminal text." optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ - {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, - {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] [[package]] name = "coverage" -version = "5.5" +version = "7.10.7" description = "Code coverage measurement for Python" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" -files = [ - {file = "coverage-5.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:b6d534e4b2ab35c9f93f46229363e17f63c53ad01330df9f2d6bd1187e5eaacf"}, - {file = "coverage-5.5-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:b7895207b4c843c76a25ab8c1e866261bcfe27bfaa20c192de5190121770672b"}, - {file = "coverage-5.5-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:c2723d347ab06e7ddad1a58b2a821218239249a9e4365eaff6649d31180c1669"}, - {file = "coverage-5.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:900fbf7759501bc7807fd6638c947d7a831fc9fdf742dc10f02956ff7220fa90"}, - {file = "coverage-5.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c"}, - {file = "coverage-5.5-cp27-cp27m-win32.whl", hash = "sha256:06191eb60f8d8a5bc046f3799f8a07a2d7aefb9504b0209aff0b47298333302a"}, - {file = "coverage-5.5-cp27-cp27m-win_amd64.whl", hash = "sha256:7501140f755b725495941b43347ba8a2777407fc7f250d4f5a7d2a1050ba8e82"}, - {file = "coverage-5.5-cp27-cp27mu-manylinux1_i686.whl", hash = 
"sha256:372da284cfd642d8e08ef606917846fa2ee350f64994bebfbd3afb0040436905"}, - {file = "coverage-5.5-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:8963a499849a1fc54b35b1c9f162f4108017b2e6db2c46c1bed93a72262ed083"}, - {file = "coverage-5.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:869a64f53488f40fa5b5b9dcb9e9b2962a66a87dab37790f3fcfb5144b996ef5"}, - {file = "coverage-5.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:4a7697d8cb0f27399b0e393c0b90f0f1e40c82023ea4d45d22bce7032a5d7b81"}, - {file = "coverage-5.5-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:8d0a0725ad7c1a0bcd8d1b437e191107d457e2ec1084b9f190630a4fb1af78e6"}, - {file = "coverage-5.5-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:51cb9476a3987c8967ebab3f0fe144819781fca264f57f89760037a2ea191cb0"}, - {file = "coverage-5.5-cp310-cp310-win_amd64.whl", hash = "sha256:c0891a6a97b09c1f3e073a890514d5012eb256845c451bd48f7968ef939bf4ae"}, - {file = "coverage-5.5-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3487286bc29a5aa4b93a072e9592f22254291ce96a9fbc5251f566b6b7343cdb"}, - {file = "coverage-5.5-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:deee1077aae10d8fa88cb02c845cfba9b62c55e1183f52f6ae6a2df6a2187160"}, - {file = "coverage-5.5-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6"}, - {file = "coverage-5.5-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:6c90e11318f0d3c436a42409f2749ee1a115cd8b067d7f14c148f1ce5574d701"}, - {file = "coverage-5.5-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:30c77c1dc9f253283e34c27935fded5015f7d1abe83bc7821680ac444eaf7793"}, - {file = "coverage-5.5-cp35-cp35m-win32.whl", hash = "sha256:9a1ef3b66e38ef8618ce5fdc7bea3d9f45f3624e2a66295eea5e57966c85909e"}, - {file = "coverage-5.5-cp35-cp35m-win_amd64.whl", hash = "sha256:972c85d205b51e30e59525694670de6a8a89691186012535f9d7dbaa230e42c3"}, - {file = "coverage-5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:af0e781009aaf59e25c5a678122391cb0f345ac0ec272c7961dc5455e1c40066"}, - {file = "coverage-5.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:74d881fc777ebb11c63736622b60cb9e4aee5cace591ce274fb69e582a12a61a"}, - {file = "coverage-5.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:92b017ce34b68a7d67bd6d117e6d443a9bf63a2ecf8567bb3d8c6c7bc5014465"}, - {file = "coverage-5.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:d636598c8305e1f90b439dbf4f66437de4a5e3c31fdf47ad29542478c8508bbb"}, - {file = "coverage-5.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:41179b8a845742d1eb60449bdb2992196e211341818565abded11cfa90efb821"}, - {file = "coverage-5.5-cp36-cp36m-win32.whl", hash = "sha256:040af6c32813fa3eae5305d53f18875bedd079960822ef8ec067a66dd8afcd45"}, - {file = "coverage-5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:5fec2d43a2cc6965edc0bb9e83e1e4b557f76f843a77a2496cbe719583ce8184"}, - {file = "coverage-5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:18ba8bbede96a2c3dde7b868de9dcbd55670690af0988713f0603f037848418a"}, - {file = "coverage-5.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2910f4d36a6a9b4214bb7038d537f015346f413a975d57ca6b43bf23d6563b53"}, - {file = "coverage-5.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d"}, - {file = "coverage-5.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:796c9c3c79747146ebd278dbe1e5c5c05dd6b10cc3bcb8389dfdf844f3ead638"}, - {file = "coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = 
"sha256:53194af30d5bad77fcba80e23a1441c71abfb3e01192034f8246e0d8f99528f3"}, - {file = "coverage-5.5-cp37-cp37m-win32.whl", hash = "sha256:184a47bbe0aa6400ed2d41d8e9ed868b8205046518c52464fde713ea06e3a74a"}, - {file = "coverage-5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2949cad1c5208b8298d5686d5a85b66aae46d73eec2c3e08c817dd3513e5848a"}, - {file = "coverage-5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:217658ec7187497e3f3ebd901afdca1af062b42cfe3e0dafea4cced3983739f6"}, - {file = "coverage-5.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1aa846f56c3d49205c952d8318e76ccc2ae23303351d9270ab220004c580cfe2"}, - {file = "coverage-5.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:24d4a7de75446be83244eabbff746d66b9240ae020ced65d060815fac3423759"}, - {file = "coverage-5.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d1f8bf7b90ba55699b3a5e44930e93ff0189aa27186e96071fac7dd0d06a1873"}, - {file = "coverage-5.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:970284a88b99673ccb2e4e334cfb38a10aab7cd44f7457564d11898a74b62d0a"}, - {file = "coverage-5.5-cp38-cp38-win32.whl", hash = "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6"}, - {file = "coverage-5.5-cp38-cp38-win_amd64.whl", hash = "sha256:2e0d881ad471768bf6e6c2bf905d183543f10098e3b3640fc029509530091502"}, - {file = "coverage-5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1f9ce122f83b2305592c11d64f181b87153fc2c2bbd3bb4a3dde8303cfb1a6b"}, - {file = "coverage-5.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:13c4ee887eca0f4c5a247b75398d4114c37882658300e153113dafb1d76de529"}, - {file = "coverage-5.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52596d3d0e8bdf3af43db3e9ba8dcdaac724ba7b5ca3f6358529d56f7a166f8b"}, - {file = "coverage-5.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2cafbbb3af0733db200c9b5f798d18953b1a304d3f86a938367de1567f4b5bff"}, - {file = "coverage-5.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:44d654437b8ddd9eee7d1eaee28b7219bec228520ff809af170488fd2fed3e2b"}, - {file = "coverage-5.5-cp39-cp39-win32.whl", hash = "sha256:d314ed732c25d29775e84a960c3c60808b682c08d86602ec2c3008e1202e3bb6"}, - {file = "coverage-5.5-cp39-cp39-win_amd64.whl", hash = "sha256:13034c4409db851670bc9acd836243aeee299949bd5673e11844befcb0149f03"}, - {file = "coverage-5.5-pp36-none-any.whl", hash = "sha256:f030f8873312a16414c0d8e1a1ddff2d3235655a2174e3648b4fa66b3f2f1079"}, - {file = "coverage-5.5-pp37-none-any.whl", hash = "sha256:2a3859cb82dcbda1cfd3e6f71c27081d18aa251d20a17d87d26d4cd216fb0af4"}, - {file = "coverage-5.5.tar.gz", hash = "sha256:ebe78fe9a0e874362175b02371bdfbee64d8edc42a044253ddf4ee7d3c15212c"}, -] - -[package.extras] -toml = ["toml"] - -[[package]] -name = "cryptography" -version = "42.0.4" -description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, - {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, - {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, - {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, - {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, - {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = 
"sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, - {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, +python-versions = ">=3.9" +files = [ + {file = "coverage-7.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fc04cc7a3db33664e0c2d10eb8990ff6b3536f6842c9590ae8da4c614b9ed05a"}, + {file = "coverage-7.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e201e015644e207139f7e2351980feb7040e6f4b2c2978892f3e3789d1c125e5"}, + {file = "coverage-7.10.7-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:240af60539987ced2c399809bd34f7c78e8abe0736af91c3d7d0e795df633d17"}, + {file = "coverage-7.10.7-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:8421e088bc051361b01c4b3a50fd39a4b9133079a2229978d9d30511fd05231b"}, + {file = "coverage-7.10.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6be8ed3039ae7f7ac5ce058c308484787c86e8437e72b30bf5e88b8ea10f3c87"}, + {file = "coverage-7.10.7-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e28299d9f2e889e6d51b1f043f58d5f997c373cc12e6403b90df95b8b047c13e"}, + {file = "coverage-7.10.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c4e16bd7761c5e454f4efd36f345286d6f7c5fa111623c355691e2755cae3b9e"}, + {file = "coverage-7.10.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b1c81d0e5e160651879755c9c675b974276f135558cf4ba79fee7b8413a515df"}, + {file = "coverage-7.10.7-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:606cc265adc9aaedcc84f1f064f0e8736bc45814f15a357e30fca7ecc01504e0"}, + {file = "coverage-7.10.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:10b24412692df990dbc34f8fb1b6b13d236ace9dfdd68df5b28c2e39cafbba13"}, + {file = "coverage-7.10.7-cp310-cp310-win32.whl", hash = "sha256:b51dcd060f18c19290d9b8a9dd1e0181538df2ce0717f562fff6cf74d9fc0b5b"}, + {file = "coverage-7.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:3a622ac801b17198020f09af3eaf45666b344a0d69fc2a6ffe2ea83aeef1d807"}, + {file = "coverage-7.10.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a609f9c93113be646f44c2a0256d6ea375ad047005d7f57a5c15f614dc1b2f59"}, + {file = 
"coverage-7.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:65646bb0359386e07639c367a22cf9b5bf6304e8630b565d0626e2bdf329227a"}, + {file = "coverage-7.10.7-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5f33166f0dfcce728191f520bd2692914ec70fac2713f6bf3ce59c3deacb4699"}, + {file = "coverage-7.10.7-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:35f5e3f9e455bb17831876048355dca0f758b6df22f49258cb5a91da23ef437d"}, + {file = "coverage-7.10.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4da86b6d62a496e908ac2898243920c7992499c1712ff7c2b6d837cc69d9467e"}, + {file = "coverage-7.10.7-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6b8b09c1fad947c84bbbc95eca841350fad9cbfa5a2d7ca88ac9f8d836c92e23"}, + {file = "coverage-7.10.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:4376538f36b533b46f8971d3a3e63464f2c7905c9800db97361c43a2b14792ab"}, + {file = "coverage-7.10.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:121da30abb574f6ce6ae09840dae322bef734480ceafe410117627aa54f76d82"}, + {file = "coverage-7.10.7-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:88127d40df529336a9836870436fc2751c339fbaed3a836d42c93f3e4bd1d0a2"}, + {file = "coverage-7.10.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ba58bbcd1b72f136080c0bccc2400d66cc6115f3f906c499013d065ac33a4b61"}, + {file = "coverage-7.10.7-cp311-cp311-win32.whl", hash = "sha256:972b9e3a4094b053a4e46832b4bc829fc8a8d347160eb39d03f1690316a99c14"}, + {file = "coverage-7.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:a7b55a944a7f43892e28ad4bc0561dfd5f0d73e605d1aa5c3c976b52aea121d2"}, + {file = "coverage-7.10.7-cp311-cp311-win_arm64.whl", hash = "sha256:736f227fb490f03c6488f9b6d45855f8e0fd749c007f9303ad30efab0e73c05a"}, + {file = "coverage-7.10.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7bb3b9ddb87ef7725056572368040c32775036472d5a033679d1fa6c8dc08417"}, + {file = "coverage-7.10.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:18afb24843cbc175687225cab1138c95d262337f5473512010e46831aa0c2973"}, + {file = "coverage-7.10.7-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:399a0b6347bcd3822be369392932884b8216d0944049ae22925631a9b3d4ba4c"}, + {file = "coverage-7.10.7-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:314f2c326ded3f4b09be11bc282eb2fc861184bc95748ae67b360ac962770be7"}, + {file = "coverage-7.10.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c41e71c9cfb854789dee6fc51e46743a6d138b1803fab6cb860af43265b42ea6"}, + {file = "coverage-7.10.7-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc01f57ca26269c2c706e838f6422e2a8788e41b3e3c65e2f41148212e57cd59"}, + {file = "coverage-7.10.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a6442c59a8ac8b85812ce33bc4d05bde3fb22321fa8294e2a5b487c3505f611b"}, + {file = "coverage-7.10.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:78a384e49f46b80fb4c901d52d92abe098e78768ed829c673fbb53c498bef73a"}, + {file = "coverage-7.10.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:5e1e9802121405ede4b0133aa4340ad8186a1d2526de5b7c3eca519db7bb89fb"}, + {file = "coverage-7.10.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d41213ea25a86f69efd1575073d34ea11aabe075604ddf3d148ecfec9e1e96a1"}, + {file = 
"coverage-7.10.7-cp312-cp312-win32.whl", hash = "sha256:77eb4c747061a6af8d0f7bdb31f1e108d172762ef579166ec84542f711d90256"}, + {file = "coverage-7.10.7-cp312-cp312-win_amd64.whl", hash = "sha256:f51328ffe987aecf6d09f3cd9d979face89a617eacdaea43e7b3080777f647ba"}, + {file = "coverage-7.10.7-cp312-cp312-win_arm64.whl", hash = "sha256:bda5e34f8a75721c96085903c6f2197dc398c20ffd98df33f866a9c8fd95f4bf"}, + {file = "coverage-7.10.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:981a651f543f2854abd3b5fcb3263aac581b18209be49863ba575de6edf4c14d"}, + {file = "coverage-7.10.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:73ab1601f84dc804f7812dc297e93cd99381162da39c47040a827d4e8dafe63b"}, + {file = "coverage-7.10.7-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a8b6f03672aa6734e700bbcd65ff050fd19cddfec4b031cc8cf1c6967de5a68e"}, + {file = "coverage-7.10.7-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10b6ba00ab1132a0ce4428ff68cf50a25efd6840a42cdf4239c9b99aad83be8b"}, + {file = "coverage-7.10.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c79124f70465a150e89340de5963f936ee97097d2ef76c869708c4248c63ca49"}, + {file = "coverage-7.10.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:69212fbccdbd5b0e39eac4067e20a4a5256609e209547d86f740d68ad4f04911"}, + {file = "coverage-7.10.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7ea7c6c9d0d286d04ed3541747e6597cbe4971f22648b68248f7ddcd329207f0"}, + {file = "coverage-7.10.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b9be91986841a75042b3e3243d0b3cb0b2434252b977baaf0cd56e960fe1e46f"}, + {file = "coverage-7.10.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:b281d5eca50189325cfe1f365fafade89b14b4a78d9b40b05ddd1fc7d2a10a9c"}, + {file = "coverage-7.10.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:99e4aa63097ab1118e75a848a28e40d68b08a5e19ce587891ab7fd04475e780f"}, + {file = "coverage-7.10.7-cp313-cp313-win32.whl", hash = "sha256:dc7c389dce432500273eaf48f410b37886be9208b2dd5710aaf7c57fd442c698"}, + {file = "coverage-7.10.7-cp313-cp313-win_amd64.whl", hash = "sha256:cac0fdca17b036af3881a9d2729a850b76553f3f716ccb0360ad4dbc06b3b843"}, + {file = "coverage-7.10.7-cp313-cp313-win_arm64.whl", hash = "sha256:4b6f236edf6e2f9ae8fcd1332da4e791c1b6ba0dc16a2dc94590ceccb482e546"}, + {file = "coverage-7.10.7-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a0ec07fd264d0745ee396b666d47cef20875f4ff2375d7c4f58235886cc1ef0c"}, + {file = "coverage-7.10.7-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:dd5e856ebb7bfb7672b0086846db5afb4567a7b9714b8a0ebafd211ec7ce6a15"}, + {file = "coverage-7.10.7-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:f57b2a3c8353d3e04acf75b3fed57ba41f5c0646bbf1d10c7c282291c97936b4"}, + {file = "coverage-7.10.7-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1ef2319dd15a0b009667301a3f84452a4dc6fddfd06b0c5c53ea472d3989fbf0"}, + {file = "coverage-7.10.7-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83082a57783239717ceb0ad584de3c69cf581b2a95ed6bf81ea66034f00401c0"}, + {file = "coverage-7.10.7-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:50aa94fb1fb9a397eaa19c0d5ec15a5edd03a47bf1a3a6111a16b36e190cff65"}, + {file = "coverage-7.10.7-cp313-cp313t-musllinux_1_2_aarch64.whl", 
hash = "sha256:2120043f147bebb41c85b97ac45dd173595ff14f2a584f2963891cbcc3091541"}, + {file = "coverage-7.10.7-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2fafd773231dd0378fdba66d339f84904a8e57a262f583530f4f156ab83863e6"}, + {file = "coverage-7.10.7-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:0b944ee8459f515f28b851728ad224fa2d068f1513ef6b7ff1efafeb2185f999"}, + {file = "coverage-7.10.7-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4b583b97ab2e3efe1b3e75248a9b333bd3f8b0b1b8e5b45578e05e5850dfb2c2"}, + {file = "coverage-7.10.7-cp313-cp313t-win32.whl", hash = "sha256:2a78cd46550081a7909b3329e2266204d584866e8d97b898cd7fb5ac8d888b1a"}, + {file = "coverage-7.10.7-cp313-cp313t-win_amd64.whl", hash = "sha256:33a5e6396ab684cb43dc7befa386258acb2d7fae7f67330ebb85ba4ea27938eb"}, + {file = "coverage-7.10.7-cp313-cp313t-win_arm64.whl", hash = "sha256:86b0e7308289ddde73d863b7683f596d8d21c7d8664ce1dee061d0bcf3fbb4bb"}, + {file = "coverage-7.10.7-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b06f260b16ead11643a5a9f955bd4b5fd76c1a4c6796aeade8520095b75de520"}, + {file = "coverage-7.10.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:212f8f2e0612778f09c55dd4872cb1f64a1f2b074393d139278ce902064d5b32"}, + {file = "coverage-7.10.7-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3445258bcded7d4aa630ab8296dea4d3f15a255588dd535f980c193ab6b95f3f"}, + {file = "coverage-7.10.7-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bb45474711ba385c46a0bfe696c695a929ae69ac636cda8f532be9e8c93d720a"}, + {file = "coverage-7.10.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:813922f35bd800dca9994c5971883cbc0d291128a5de6b167c7aa697fcf59360"}, + {file = "coverage-7.10.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:93c1b03552081b2a4423091d6fb3787265b8f86af404cff98d1b5342713bdd69"}, + {file = "coverage-7.10.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:cc87dd1b6eaf0b848eebb1c86469b9f72a1891cb42ac7adcfbce75eadb13dd14"}, + {file = "coverage-7.10.7-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:39508ffda4f343c35f3236fe8d1a6634a51f4581226a1262769d7f970e73bffe"}, + {file = "coverage-7.10.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:925a1edf3d810537c5a3abe78ec5530160c5f9a26b1f4270b40e62cc79304a1e"}, + {file = "coverage-7.10.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2c8b9a0636f94c43cd3576811e05b89aa9bc2d0a85137affc544ae5cb0e4bfbd"}, + {file = "coverage-7.10.7-cp314-cp314-win32.whl", hash = "sha256:b7b8288eb7cdd268b0304632da8cb0bb93fadcfec2fe5712f7b9cc8f4d487be2"}, + {file = "coverage-7.10.7-cp314-cp314-win_amd64.whl", hash = "sha256:1ca6db7c8807fb9e755d0379ccc39017ce0a84dcd26d14b5a03b78563776f681"}, + {file = "coverage-7.10.7-cp314-cp314-win_arm64.whl", hash = "sha256:097c1591f5af4496226d5783d036bf6fd6cd0cbc132e071b33861de756efb880"}, + {file = "coverage-7.10.7-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:a62c6ef0d50e6de320c270ff91d9dd0a05e7250cac2a800b7784bae474506e63"}, + {file = "coverage-7.10.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9fa6e4dd51fe15d8738708a973470f67a855ca50002294852e9571cdbd9433f2"}, + {file = "coverage-7.10.7-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8fb190658865565c549b6b4706856d6a7b09302c797eb2cf8e7fe9dabb043f0d"}, + {file = 
"coverage-7.10.7-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:affef7c76a9ef259187ef31599a9260330e0335a3011732c4b9effa01e1cd6e0"}, + {file = "coverage-7.10.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e16e07d85ca0cf8bafe5f5d23a0b850064e8e945d5677492b06bbe6f09cc699"}, + {file = "coverage-7.10.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:03ffc58aacdf65d2a82bbeb1ffe4d01ead4017a21bfd0454983b88ca73af94b9"}, + {file = "coverage-7.10.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1b4fd784344d4e52647fd7857b2af5b3fbe6c239b0b5fa63e94eb67320770e0f"}, + {file = "coverage-7.10.7-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:0ebbaddb2c19b71912c6f2518e791aa8b9f054985a0769bdb3a53ebbc765c6a1"}, + {file = "coverage-7.10.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a2d9a3b260cc1d1dbdb1c582e63ddcf5363426a1a68faa0f5da28d8ee3c722a0"}, + {file = "coverage-7.10.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a3cc8638b2480865eaa3926d192e64ce6c51e3d29c849e09d5b4ad95efae5399"}, + {file = "coverage-7.10.7-cp314-cp314t-win32.whl", hash = "sha256:67f8c5cbcd3deb7a60b3345dffc89a961a484ed0af1f6f73de91705cc6e31235"}, + {file = "coverage-7.10.7-cp314-cp314t-win_amd64.whl", hash = "sha256:e1ed71194ef6dea7ed2d5cb5f7243d4bcd334bfb63e59878519be558078f848d"}, + {file = "coverage-7.10.7-cp314-cp314t-win_arm64.whl", hash = "sha256:7fe650342addd8524ca63d77b2362b02345e5f1a093266787d210c70a50b471a"}, + {file = "coverage-7.10.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fff7b9c3f19957020cac546c70025331113d2e61537f6e2441bc7657913de7d3"}, + {file = "coverage-7.10.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bc91b314cef27742da486d6839b677b3f2793dfe52b51bbbb7cf736d5c29281c"}, + {file = "coverage-7.10.7-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:567f5c155eda8df1d3d439d40a45a6a5f029b429b06648235f1e7e51b522b396"}, + {file = "coverage-7.10.7-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2af88deffcc8a4d5974cf2d502251bc3b2db8461f0b66d80a449c33757aa9f40"}, + {file = "coverage-7.10.7-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7315339eae3b24c2d2fa1ed7d7a38654cba34a13ef19fbcb9425da46d3dc594"}, + {file = "coverage-7.10.7-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:912e6ebc7a6e4adfdbb1aec371ad04c68854cd3bf3608b3514e7ff9062931d8a"}, + {file = "coverage-7.10.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:f49a05acd3dfe1ce9715b657e28d138578bc40126760efb962322c56e9ca344b"}, + {file = "coverage-7.10.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:cce2109b6219f22ece99db7644b9622f54a4e915dad65660ec435e89a3ea7cc3"}, + {file = "coverage-7.10.7-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:f3c887f96407cea3916294046fc7dab611c2552beadbed4ea901cbc6a40cc7a0"}, + {file = "coverage-7.10.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:635adb9a4507c9fd2ed65f39693fa31c9a3ee3a8e6dc64df033e8fdf52a7003f"}, + {file = "coverage-7.10.7-cp39-cp39-win32.whl", hash = "sha256:5a02d5a850e2979b0a014c412573953995174743a3f7fa4ea5a6e9a3c5617431"}, + {file = "coverage-7.10.7-cp39-cp39-win_amd64.whl", hash = "sha256:c134869d5ffe34547d14e174c866fd8fe2254918cc0a95e99052903bc1543e07"}, + {file = "coverage-7.10.7-py3-none-any.whl", hash = 
"sha256:f7941f6f2fe6dd6807a1208737b8a0cbcf1cc6d7b07d24998ad2d63590868260"}, + {file = "coverage-7.10.7.tar.gz", hash = "sha256:f4ab143ab113be368a3e9b795f9cd7906c5ef407d6173fe9675a902e1fffc239"}, ] [package.dependencies] -cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] -nox = ["nox"] -pep8test = ["check-sdist", "click", "mypy", "ruff"] -sdist = ["build"] -ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] -test-randomorder = ["pytest-randomly"] +toml = ["tomli"] [[package]] name = "distlib" -version = "0.3.6" +version = "0.4.0" description = "Distribution utilities" optional = false python-versions = "*" files = [ - {file = "distlib-0.3.6-py2.py3-none-any.whl", hash = "sha256:f35c4b692542ca110de7ef0bea44d73981caeb34ca0b9b6b2e6d7790dda8f80e"}, - {file = "distlib-0.3.6.tar.gz", hash = "sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46"}, + {file = "distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16"}, + {file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"}, ] [[package]] -name = "docutils" -version = "0.19" -description = "Docutils -- Python Documentation Utilities" +name = "exceptiongroup" +version = "1.3.1" +description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "docutils-0.19-py3-none-any.whl", hash = "sha256:5e1de4d849fee02c63b040a4a3fd567f4ab104defd8a5511fbbc24a8a017efbc"}, - {file = "docutils-0.19.tar.gz", hash = "sha256:33995a6753c30b7f577febfc2c50411fec6aac7f7ffeb7c4cfe5991072dcf9e6"}, -] - -[[package]] -name = "dparse" -version = "0.6.0" -description = "A parser for Python dependency files" -optional = false -python-versions = ">=3.5" -files = [ - {file = "dparse-0.6.0-py3-none-any.whl", hash = "sha256:3cb489bd06bfa8d285c85f7dec69d9ee8f89c29dd5f4ab48e159746dc13b78b2"}, - {file = "dparse-0.6.0.tar.gz", hash = "sha256:57068bb61859b1676c6beb10f399906eecb41a75b5d3fbc99d0311059cb67213"}, + {file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"}, + {file = "exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219"}, ] [package.dependencies] -packaging = "*" -toml = "*" +typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""} [package.extras] -conda = ["pyyaml"] -pipenv = ["pipenv"] +test = ["pytest (>=6)"] [[package]] name = "filelock" -version = "3.8.0" +version = "3.19.1" description = "A platform independent file lock." 
optional = false -python-versions = ">=3.7" -files = [ - {file = "filelock-3.8.0-py3-none-any.whl", hash = "sha256:617eb4e5eedc82fc5f47b6d61e4d11cb837c56cb4544e39081099fa17ad109d4"}, - {file = "filelock-3.8.0.tar.gz", hash = "sha256:55447caa666f2198c5b6b13a26d2084d26fa5b115c00d065664b2124680c4edc"}, -] - -[package.extras] -docs = ["furo (>=2022.6.21)", "sphinx (>=5.1.1)", "sphinx-autodoc-typehints (>=1.19.1)"] -testing = ["covdefaults (>=2.2)", "coverage (>=6.4.2)", "pytest (>=7.1.2)", "pytest-cov (>=3)", "pytest-timeout (>=2.1)"] - -[[package]] -name = "flake8" -version = "3.9.2" -description = "the modular source code checker: pep8 pyflakes and co" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -files = [ - {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, - {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, -] - -[package.dependencies] -mccabe = ">=0.6.0,<0.7.0" -pycodestyle = ">=2.7.0,<2.8.0" -pyflakes = ">=2.3.0,<2.4.0" - -[[package]] -name = "flake8-docstrings" -version = "1.6.0" -description = "Extension for flake8 which uses pydocstyle to check docstrings" -optional = false -python-versions = "*" +python-versions = ">=3.9" files = [ - {file = "flake8-docstrings-1.6.0.tar.gz", hash = "sha256:9fe7c6a306064af8e62a055c2f61e9eb1da55f84bb39caef2b84ce53708ac34b"}, - {file = "flake8_docstrings-1.6.0-py2.py3-none-any.whl", hash = "sha256:99cac583d6c7e32dd28bbfbef120a7c0d1b6dde4adb5a9fd441c4227a6534bde"}, + {file = "filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d"}, + {file = "filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58"}, ] -[package.dependencies] -flake8 = ">=3" -pydocstyle = ">=2.1" - [[package]] -name = "identify" -version = "2.5.5" -description = "File identification library for Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "identify-2.5.5-py2.py3-none-any.whl", hash = "sha256:ef78c0d96098a3b5fe7720be4a97e73f439af7cf088ebf47b620aeaa10fadf97"}, - {file = "identify-2.5.5.tar.gz", hash = "sha256:322a5699daecf7c6fd60e68852f36f2ecbb6a36ff6e6e973e0d2bb6fca203ee6"}, -] - -[package.extras] -license = ["ukkonen"] - -[[package]] -name = "idna" -version = "3.4" -description = "Internationalized Domain Names in Applications (IDNA)" +name = "filelock" +version = "3.20.3" +description = "A platform independent file lock." 
optional = false -python-versions = ">=3.5" +python-versions = ">=3.10" files = [ - {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, - {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, + {file = "filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1"}, + {file = "filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1"}, ] [[package]] -name = "importlib-metadata" -version = "4.12.0" -description = "Read metadata from Python packages" +name = "googleapis-common-protos" +version = "1.72.0" +description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" files = [ - {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, - {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"}, + {file = "googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038"}, + {file = "googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5"}, ] [package.dependencies] -zipp = ">=0.5" +protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0" [package.extras] -docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] -perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] - -[[package]] -name = "iniconfig" -version = "1.1.1" -description = "iniconfig: brain-dead simple config-ini parsing" -optional = false -python-versions = "*" -files = [ - {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, - {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, -] - -[[package]] -name = "jaraco.classes" -version = "3.2.2" -description = "Utility functions for Python class constructs" -optional = false -python-versions = ">=3.7" -files = [ - {file = "jaraco.classes-3.2.2-py3-none-any.whl", hash = "sha256:e6ef6fd3fcf4579a7a019d87d1e56a883f4e4c35cfe925f86731abc58804e647"}, - {file = "jaraco.classes-3.2.2.tar.gz", hash = "sha256:6745f113b0b588239ceb49532aa09c3ebb947433ce311ef2f8e3ad64ebb74594"}, +grpc = ["grpcio (>=1.44.0,<2.0.0)"] + +[[package]] +name = "grpcio" +version = "1.76.0" +description = "HTTP/2-based RPC framework" +optional = false +python-versions = ">=3.9" +files = [ + {file = "grpcio-1.76.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:65a20de41e85648e00305c1bb09a3598f840422e522277641145a32d42dcefcc"}, + {file = "grpcio-1.76.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:40ad3afe81676fd9ec6d9d406eda00933f218038433980aa19d401490e46ecde"}, + {file = "grpcio-1.76.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:035d90bc79eaa4bed83f524331d55e35820725c9fbb00ffa1904d5550ed7ede3"}, + {file = "grpcio-1.76.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = 
"sha256:4215d3a102bd95e2e11b5395c78562967959824156af11fa93d18fdd18050990"}, + {file = "grpcio-1.76.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49ce47231818806067aea3324d4bf13825b658ad662d3b25fada0bdad9b8a6af"}, + {file = "grpcio-1.76.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8cc3309d8e08fd79089e13ed4819d0af72aa935dd8f435a195fd152796752ff2"}, + {file = "grpcio-1.76.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:971fd5a1d6e62e00d945423a567e42eb1fa678ba89072832185ca836a94daaa6"}, + {file = "grpcio-1.76.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9d9adda641db7207e800a7f089068f6f645959f2df27e870ee81d44701dd9db3"}, + {file = "grpcio-1.76.0-cp310-cp310-win32.whl", hash = "sha256:063065249d9e7e0782d03d2bca50787f53bd0fb89a67de9a7b521c4a01f1989b"}, + {file = "grpcio-1.76.0-cp310-cp310-win_amd64.whl", hash = "sha256:a6ae758eb08088d36812dd5d9af7a9859c05b1e0f714470ea243694b49278e7b"}, + {file = "grpcio-1.76.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:2e1743fbd7f5fa713a1b0a8ac8ebabf0ec980b5d8809ec358d488e273b9cf02a"}, + {file = "grpcio-1.76.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:a8c2cf1209497cf659a667d7dea88985e834c24b7c3b605e6254cbb5076d985c"}, + {file = "grpcio-1.76.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:08caea849a9d3c71a542827d6df9d5a69067b0a1efbea8a855633ff5d9571465"}, + {file = "grpcio-1.76.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:f0e34c2079d47ae9f6188211db9e777c619a21d4faba6977774e8fa43b085e48"}, + {file = "grpcio-1.76.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8843114c0cfce61b40ad48df65abcfc00d4dba82eae8718fab5352390848c5da"}, + {file = "grpcio-1.76.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8eddfb4d203a237da6f3cc8a540dad0517d274b5a1e9e636fd8d2c79b5c1d397"}, + {file = "grpcio-1.76.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:32483fe2aab2c3794101c2a159070584e5db11d0aa091b2c0ea9c4fc43d0d749"}, + {file = "grpcio-1.76.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:dcfe41187da8992c5f40aa8c5ec086fa3672834d2be57a32384c08d5a05b4c00"}, + {file = "grpcio-1.76.0-cp311-cp311-win32.whl", hash = "sha256:2107b0c024d1b35f4083f11245c0e23846ae64d02f40b2b226684840260ed054"}, + {file = "grpcio-1.76.0-cp311-cp311-win_amd64.whl", hash = "sha256:522175aba7af9113c48ec10cc471b9b9bd4f6ceb36aeb4544a8e2c80ed9d252d"}, + {file = "grpcio-1.76.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:81fd9652b37b36f16138611c7e884eb82e0cec137c40d3ef7c3f9b3ed00f6ed8"}, + {file = "grpcio-1.76.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:04bbe1bfe3a68bbfd4e52402ab7d4eb59d72d02647ae2042204326cf4bbad280"}, + {file = "grpcio-1.76.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d388087771c837cdb6515539f43b9d4bf0b0f23593a24054ac16f7a960be16f4"}, + {file = "grpcio-1.76.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:9f8f757bebaaea112c00dba718fc0d3260052ce714e25804a03f93f5d1c6cc11"}, + {file = "grpcio-1.76.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:980a846182ce88c4f2f7e2c22c56aefd515daeb36149d1c897f83cf57999e0b6"}, + {file = "grpcio-1.76.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f92f88e6c033db65a5ae3d97905c8fea9c725b63e28d5a75cb73b49bda5024d8"}, + {file = "grpcio-1.76.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4baf3cbe2f0be3289eb68ac8ae771156971848bb8aaff60bad42005539431980"}, + {file = 
"grpcio-1.76.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:615ba64c208aaceb5ec83bfdce7728b80bfeb8be97562944836a7a0a9647d882"}, + {file = "grpcio-1.76.0-cp312-cp312-win32.whl", hash = "sha256:45d59a649a82df5718fd9527ce775fd66d1af35e6d31abdcdc906a49c6822958"}, + {file = "grpcio-1.76.0-cp312-cp312-win_amd64.whl", hash = "sha256:c088e7a90b6017307f423efbb9d1ba97a22aa2170876223f9709e9d1de0b5347"}, + {file = "grpcio-1.76.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:26ef06c73eb53267c2b319f43e6634c7556ea37672029241a056629af27c10e2"}, + {file = "grpcio-1.76.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:45e0111e73f43f735d70786557dc38141185072d7ff8dc1829d6a77ac1471468"}, + {file = "grpcio-1.76.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:83d57312a58dcfe2a3a0f9d1389b299438909a02db60e2f2ea2ae2d8034909d3"}, + {file = "grpcio-1.76.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:3e2a27c89eb9ac3d81ec8835e12414d73536c6e620355d65102503064a4ed6eb"}, + {file = "grpcio-1.76.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:61f69297cba3950a524f61c7c8ee12e55c486cb5f7db47ff9dcee33da6f0d3ae"}, + {file = "grpcio-1.76.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6a15c17af8839b6801d554263c546c69c4d7718ad4321e3166175b37eaacca77"}, + {file = "grpcio-1.76.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:25a18e9810fbc7e7f03ec2516addc116a957f8cbb8cbc95ccc80faa072743d03"}, + {file = "grpcio-1.76.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42"}, + {file = "grpcio-1.76.0-cp313-cp313-win32.whl", hash = "sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f"}, + {file = "grpcio-1.76.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8"}, + {file = "grpcio-1.76.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62"}, + {file = "grpcio-1.76.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd"}, + {file = "grpcio-1.76.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc"}, + {file = "grpcio-1.76.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a"}, + {file = "grpcio-1.76.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba"}, + {file = "grpcio-1.76.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09"}, + {file = "grpcio-1.76.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc"}, + {file = "grpcio-1.76.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc"}, + {file = "grpcio-1.76.0-cp314-cp314-win32.whl", hash = "sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e"}, + {file = "grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e"}, + {file = "grpcio-1.76.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:8ebe63ee5f8fa4296b1b8cfc743f870d10e902ca18afc65c68cf46fd39bb0783"}, + {file = 
"grpcio-1.76.0-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:3bf0f392c0b806905ed174dcd8bdd5e418a40d5567a05615a030a5aeddea692d"}, + {file = "grpcio-1.76.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b7604868b38c1bfd5cf72d768aedd7db41d78cb6a4a18585e33fb0f9f2363fd"}, + {file = "grpcio-1.76.0-cp39-cp39-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:e6d1db20594d9daba22f90da738b1a0441a7427552cc6e2e3d1297aeddc00378"}, + {file = "grpcio-1.76.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d099566accf23d21037f18a2a63d323075bebace807742e4b0ac210971d4dd70"}, + {file = "grpcio-1.76.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ebea5cc3aa8ea72e04df9913492f9a96d9348db876f9dda3ad729cfedf7ac416"}, + {file = "grpcio-1.76.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0c37db8606c258e2ee0c56b78c62fc9dee0e901b5dbdcf816c2dd4ad652b8b0c"}, + {file = "grpcio-1.76.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ebebf83299b0cb1721a8859ea98f3a77811e35dce7609c5c963b9ad90728f886"}, + {file = "grpcio-1.76.0-cp39-cp39-win32.whl", hash = "sha256:0aaa82d0813fd4c8e589fac9b65d7dd88702555f702fb10417f96e2a2a6d4c0f"}, + {file = "grpcio-1.76.0-cp39-cp39-win_amd64.whl", hash = "sha256:acab0277c40eff7143c2323190ea57b9ee5fd353d8190ee9652369fae735668a"}, + {file = "grpcio-1.76.0.tar.gz", hash = "sha256:7be78388d6da1a25c0d5ec506523db58b18be22d9c37d8d3a32c08be4987bd73"}, ] [package.dependencies] -more-itertools = "*" +typing-extensions = ">=4.12,<5.0" [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +protobuf = ["grpcio-tools (>=1.76.0)"] [[package]] -name = "jeepney" -version = "0.8.0" -description = "Low-level, pure Python DBus protocol wrapper." +name = "grpcio-status" +version = "1.76.0" +description = "Status proto mapping for gRPC" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"}, - {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"}, + {file = "grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18"}, + {file = "grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd"}, ] -[package.extras] -test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] -trio = ["async_generator", "trio"] +[package.dependencies] +googleapis-common-protos = ">=1.5.5" +grpcio = ">=1.76.0" +protobuf = ">=6.31.1,<7.0.0" [[package]] -name = "keyring" -version = "23.9.1" -description = "Store and access your passwords safely." 
+name = "identify" +version = "2.6.15" +description = "File identification library for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "keyring-23.9.1-py3-none-any.whl", hash = "sha256:3565b9e4ea004c96e158d2d332a49f466733d565bb24157a60fd2e49f41a0fd1"}, - {file = "keyring-23.9.1.tar.gz", hash = "sha256:39e4f6572238d2615a82fcaa485e608b84b503cf080dc924c43bbbacb11c1c18"}, + {file = "identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757"}, + {file = "identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf"}, ] -[package.dependencies] -importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.10\""} -"jaraco.classes" = "*" -jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} -pywin32-ctypes = {version = "<0.1.0 || >0.1.0,<0.1.1 || >0.1.1", markers = "sys_platform == \"win32\""} -SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} - [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] - -[[package]] -name = "mccabe" -version = "0.6.1" -description = "McCabe checker, plugin for flake8" -optional = false -python-versions = "*" -files = [ - {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, - {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, -] +license = ["ukkonen"] [[package]] -name = "more-itertools" -version = "8.14.0" -description = "More routines for operating on iterables, beyond itertools" +name = "iniconfig" +version = "2.1.0" +description = "brain-dead simple config-ini parsing" optional = false -python-versions = ">=3.5" +python-versions = ">=3.8" files = [ - {file = "more-itertools-8.14.0.tar.gz", hash = "sha256:c09443cd3d5438b8dafccd867a6bc1cb0894389e90cb53d227456b0b0bccb750"}, - {file = "more_itertools-8.14.0-py3-none-any.whl", hash = "sha256:1bc4f91ee5b1b31ac7ceacc17c09befe6a40a503907baf9c839c229b5095cfd2"}, + {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, + {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] [[package]] name = "mypy-extensions" -version = "0.4.3" -description = "Experimental type system extensions for programs checked with the mypy typechecker." +version = "1.1.0" +description = "Type system extensions for programs checked with the mypy type checker." 
optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, - {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, + {file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"}, + {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, ] [[package]] name = "nodeenv" -version = "1.7.0" +version = "1.10.0" description = "Node.js virtual environment builder" optional = false -python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ - {file = "nodeenv-1.7.0-py2.py3-none-any.whl", hash = "sha256:27083a7b96a25f2f5e1d8cb4b6317ee8aeda3bdd121394e5ac54e498028a042e"}, - {file = "nodeenv-1.7.0.tar.gz", hash = "sha256:e0e7f7dfb85fc5394c6fe1e8fa98131a2473e04311a45afb6508f7cf1836fa2b"}, + {file = "nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827"}, + {file = "nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb"}, ] -[package.dependencies] -setuptools = "*" - [[package]] name = "numpy" -version = "1.23.3" -description = "NumPy is the fundamental package for array computing with Python." -optional = false -python-versions = ">=3.8" -files = [ - {file = "numpy-1.23.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c9f707b5bb73bf277d812ded9896f9512a43edff72712f31667d0a8c2f8e71ee"}, - {file = "numpy-1.23.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffcf105ecdd9396e05a8e58e81faaaf34d3f9875f137c7372450baa5d77c9a54"}, - {file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ea3f98a0ffce3f8f57675eb9119f3f4edb81888b6874bc1953f91e0b1d4f440"}, - {file = "numpy-1.23.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004f0efcb2fe1c0bd6ae1fcfc69cc8b6bf2407e0f18be308612007a0762b4089"}, - {file = "numpy-1.23.3-cp310-cp310-win32.whl", hash = "sha256:98dcbc02e39b1658dc4b4508442a560fe3ca5ca0d989f0df062534e5ca3a5c1a"}, - {file = "numpy-1.23.3-cp310-cp310-win_amd64.whl", hash = "sha256:39a664e3d26ea854211867d20ebcc8023257c1800ae89773cbba9f9e97bae036"}, - {file = "numpy-1.23.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1f27b5322ac4067e67c8f9378b41c746d8feac8bdd0e0ffede5324667b8a075c"}, - {file = "numpy-1.23.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ad3ec9a748a8943e6eb4358201f7e1c12ede35f510b1a2221b70af4bb64295c"}, - {file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdc9febce3e68b697d931941b263c59e0c74e8f18861f4064c1f712562903411"}, - {file = "numpy-1.23.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:301c00cf5e60e08e04d842fc47df641d4a181e651c7135c50dc2762ffe293dbd"}, - {file = "numpy-1.23.3-cp311-cp311-win32.whl", hash = "sha256:7cd1328e5bdf0dee621912f5833648e2daca72e3839ec1d6695e91089625f0b4"}, - {file = "numpy-1.23.3-cp311-cp311-win_amd64.whl", hash = "sha256:8355fc10fd33a5a70981a5b8a0de51d10af3688d7a9e4a34fcc8fa0d7467bb7f"}, - {file = "numpy-1.23.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:bc6e8da415f359b578b00bcfb1d08411c96e9a97f9e6c7adada554a0812a6cc6"}, - {file = "numpy-1.23.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:22d43376ee0acd547f3149b9ec12eec2f0ca4a6ab2f61753c5b29bb3e795ac4d"}, - {file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a64403f634e5ffdcd85e0b12c08f04b3080d3e840aef118721021f9b48fc1460"}, - {file = "numpy-1.23.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efd9d3abe5774404becdb0748178b48a218f1d8c44e0375475732211ea47c67e"}, - {file = "numpy-1.23.3-cp38-cp38-win32.whl", hash = "sha256:f8c02ec3c4c4fcb718fdf89a6c6f709b14949408e8cf2a2be5bfa9c49548fd85"}, - {file = "numpy-1.23.3-cp38-cp38-win_amd64.whl", hash = "sha256:e868b0389c5ccfc092031a861d4e158ea164d8b7fdbb10e3b5689b4fc6498df6"}, - {file = "numpy-1.23.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:09f6b7bdffe57fc61d869a22f506049825d707b288039d30f26a0d0d8ea05164"}, - {file = "numpy-1.23.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8c79d7cf86d049d0c5089231a5bcd31edb03555bd93d81a16870aa98c6cfb79d"}, - {file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d5420053bbb3dd64c30e58f9363d7a9c27444c3648e61460c1237f9ec3fa14"}, - {file = "numpy-1.23.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5422d6a1ea9b15577a9432e26608c73a78faf0b9039437b075cf322c92e98e7"}, - {file = "numpy-1.23.3-cp39-cp39-win32.whl", hash = "sha256:c1ba66c48b19cc9c2975c0d354f24058888cdc674bebadceb3cdc9ec403fb5d1"}, - {file = "numpy-1.23.3-cp39-cp39-win_amd64.whl", hash = "sha256:78a63d2df1d947bd9d1b11d35564c2f9e4b57898aae4626638056ec1a231c40c"}, - {file = "numpy-1.23.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:17c0e467ade9bda685d5ac7f5fa729d8d3e76b23195471adae2d6a6941bd2c18"}, - {file = "numpy-1.23.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91b8d6768a75247026e951dce3b2aac79dc7e78622fc148329135ba189813584"}, - {file = "numpy-1.23.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:94c15ca4e52671a59219146ff584488907b1f9b3fc232622b47e2cf832e94fb8"}, - {file = "numpy-1.23.3.tar.gz", hash = "sha256:51bf49c0cd1d52be0a240aa66f3458afc4b95d8993d2d04f0d91fa60c10af6cd"}, +version = "2.0.2" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"}, + {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"}, + {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"}, + {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"}, + {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"}, + {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"}, + {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"}, + {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"}, + {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"}, + {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"}, + {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"}, + {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"}, + {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"}, + {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"}, + {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"}, + {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"}, + {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"}, + {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"}, + {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"}, + {file = 
"numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"}, + {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"}, + {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"}, + {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"}, + {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"}, + {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"}, + {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"}, + {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"}, ] [[package]] name = "packaging" -version = "21.3" +version = "25.0" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, - {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, + {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, + {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, ] -[package.dependencies] -pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" - [[package]] name = "pandas" -version = "1.4.4" +version = "2.3.3" description = "Powerful data structures for data analysis, time series, and statistics" optional = false -python-versions = ">=3.8" -files = [ - {file = "pandas-1.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:799e6a25932df7e6b1f8dabf63de064e2205dc309abb75956126a0453fd88e97"}, - {file = "pandas-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7cd1d69a387f7d5e1a5a06a87574d9ef2433847c0e78113ab51c84d3a8bcaeaa"}, - {file = "pandas-1.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:94f2ed1fd51e545ebf71da1e942fe1822ee01e10d3dd2a7276d01351333b7c6b"}, - {file = "pandas-1.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4591cadd06fbbbd16fafc2de6e840c1aaefeae3d5864b688004777ef1bbdede3"}, - {file = "pandas-1.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d0022fe6a313df1c4869b5edc012d734c6519a6fffa3cf70930f32e6a1078e49"}, - {file = "pandas-1.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:785e878a6e6d8ddcdb8c181e600855402750052497d7fc6d6b508894f6b8830b"}, - {file = "pandas-1.4.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c4bb8b0ab9f94207d07e401d24baebfc63057246b1a5e0cd9ee50df85a656871"}, - {file = "pandas-1.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:51c424ca134fdaeac9a4acd719d1ab48046afc60943a489028f0413fdbe9ef1c"}, - {file = "pandas-1.4.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ce35f947202b0b99c660221d82beb91d2e6d553d55a40b30128204e3e2c63848"}, - {file = "pandas-1.4.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee6f1848148ed3204235967613b0a32be2d77f214e9623f554511047705c1e04"}, - {file = "pandas-1.4.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7cc960959be28d064faefc0cb2aef854d46b827c004ebea7e79b5497ed83e7d"}, - {file = "pandas-1.4.4-cp38-cp38-win32.whl", hash = "sha256:9d805bce209714b1c1fa29bfb1e42ad87e4c0a825e4b390c56a3e71593b7e8d8"}, - {file = "pandas-1.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:afbddad78a98ec4d2ce08b384b81730de1ccc975b99eb663e6dac43703f36d98"}, - {file = "pandas-1.4.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a08ceb59db499864c58a9bf85ab6219d527d91f14c0240cc25fa2c261032b2a7"}, - {file = "pandas-1.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0959c41004e3d2d16f39c828d6da66ebee329836a7ecee49fb777ac9ad8a7501"}, - {file = "pandas-1.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:87b4194f344dcd14c0f885cecb22005329b38bda10f1aaf7b9596a00ec8a4768"}, - {file = "pandas-1.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d2a7a3c1fea668d56bd91edbd5f2732e0af8feb9d2bf8d9bfacb2dea5fa9536"}, - {file = "pandas-1.4.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a981cfabf51c318a562deb4ae7deec594c07aee7cf18b4594a92c23718ec8275"}, - {file = "pandas-1.4.4-cp39-cp39-win32.whl", hash = "sha256:050aada67a5ec6699a7879e769825b510018a95fb9ac462bb1867483d0974a97"}, - {file = "pandas-1.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:8d4d2fe2863ecddb0ba1979bdda26c8bc2ea138f5a979abe3ba80c0fa4015c91"}, - {file = "pandas-1.4.4.tar.gz", hash = "sha256:ab6c0d738617b675183e5f28db32b5148b694ad9bba0a40c3ea26d96b431db67"}, +python-versions = ">=3.9" +files = [ + {file = "pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c"}, + {file = "pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a"}, + {file = "pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1"}, + {file = "pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838"}, + {file = "pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250"}, + {file = "pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4"}, + {file = "pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826"}, + {file = "pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523"}, + {file = "pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45"}, + {file = "pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66"}, + {file = "pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b"}, + {file = "pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791"}, + {file = "pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151"}, + {file = "pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c"}, + {file = "pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53"}, + {file = "pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35"}, + {file = "pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908"}, + {file = "pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89"}, + {file = "pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98"}, + {file = "pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084"}, + {file = "pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b"}, + {file = "pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713"}, + {file = "pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8"}, + {file = "pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d"}, + {file = "pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac"}, + {file = "pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c"}, + {file = "pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493"}, + {file = "pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee"}, + {file = "pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5"}, + {file = "pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21"}, + {file = "pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78"}, + {file = "pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110"}, + {file = "pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86"}, + {file = "pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc"}, + {file = "pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0"}, + {file = "pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593"}, + {file = "pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c"}, + {file = "pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b"}, + {file = "pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6"}, + {file = "pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3"}, + {file = "pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5"}, + {file = "pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec"}, + {file = "pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7"}, + {file = "pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450"}, + {file = "pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5"}, + {file = "pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788"}, + {file = "pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87"}, + {file = "pandas-2.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2"}, + {file = "pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8"}, + {file = "pandas-2.3.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff"}, + {file = "pandas-2.3.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29"}, + {file = "pandas-2.3.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73"}, + {file = "pandas-2.3.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9"}, + {file = "pandas-2.3.3-cp39-cp39-win_amd64.whl", hash = 
"sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa"}, + {file = "pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b"}, ] [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, - {version = ">=1.18.5", markers = "(platform_machine != \"aarch64\" and platform_machine != \"arm64\") and python_version < \"3.10\""}, - {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, - {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] -python-dateutil = ">=2.8.1" +python-dateutil = ">=2.8.2" pytz = ">=2020.1" +tzdata = ">=2022.7" [package.extras] -test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] [[package]] name = "pathspec" -version = "0.10.1" +version = "1.0.3" description = "Utility library for gitignore style pattern matching of file paths." 
optional = false -python-versions = ">=3.7" -files = [ - {file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"}, - {file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"}, -] - -[[package]] -name = "pkginfo" -version = "1.8.3" -description = "Query metadatdata from sdists / bdists / installed packages." -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = ">=3.9" files = [ - {file = "pkginfo-1.8.3-py2.py3-none-any.whl", hash = "sha256:848865108ec99d4901b2f7e84058b6e7660aae8ae10164e015a6dcf5b242a594"}, - {file = "pkginfo-1.8.3.tar.gz", hash = "sha256:a84da4318dd86f870a9447a8c98340aa06216bfc6f2b7bdc4b8766984ae1867c"}, + {file = "pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c"}, + {file = "pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d"}, ] [package.extras] -testing = ["coverage", "nose"] +hyperscan = ["hyperscan (>=0.7)"] +optional = ["typing-extensions (>=4)"] +re2 = ["google-re2 (>=1.1)"] +tests = ["pytest (>=9)", "typing-extensions (>=4.15)"] [[package]] name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +version = "4.4.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, - {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, + {file = "platformdirs-4.4.0-py3-none-any.whl", hash = "sha256:abd01743f24e5287cd7a5db3752faf1a2d65353f38ec26d98e25a6db65958c85"}, + {file = "platformdirs-4.4.0.tar.gz", hash = "sha256:ca753cf4d81dc309bc67b0ea38fd15dc97bc30ce419a7f58d13eb3bf14c4febf"}, ] [package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] -test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.4)", "pytest-cov (>=6)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.14.1)"] [[package]] name = "pluggy" -version = "1.0.0" +version = "1.6.0" description = "plugin and hook calling mechanisms for python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" files = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, + {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, + {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] [package.extras] dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] +testing = ["coverage", "pytest", "pytest-benchmark"] [[package]] name = "pre-commit" -version = "2.20.0" +version = "3.8.0" 
description = "A framework for managing and maintaining multi-language pre-commit hooks." optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "pre_commit-2.20.0-py2.py3-none-any.whl", hash = "sha256:51a5ba7c480ae8072ecdb6933df22d2f812dc897d5fe848778116129a681aac7"}, - {file = "pre_commit-2.20.0.tar.gz", hash = "sha256:a978dac7bc9ec0bcee55c18a277d553b0f419d259dadb4b9418ff2d00eb43959"}, + {file = "pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f"}, + {file = "pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af"}, ] [package.dependencies] @@ -739,225 +635,198 @@ cfgv = ">=2.0.0" identify = ">=1.0.0" nodeenv = ">=0.11.1" pyyaml = ">=5.1" -toml = "*" -virtualenv = ">=20.0.8" +virtualenv = ">=20.10.0" [[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" +name = "protobuf" +version = "6.33.4" +description = "" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.9" files = [ - {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, - {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, + {file = "protobuf-6.33.4-cp310-abi3-win32.whl", hash = "sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d"}, + {file = "protobuf-6.33.4-cp310-abi3-win_amd64.whl", hash = "sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc"}, + {file = "protobuf-6.33.4-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:2fe67f6c014c84f655ee06f6f66213f9254b3a8b6bda6cda0ccd4232c73c06f0"}, + {file = "protobuf-6.33.4-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:757c978f82e74d75cba88eddec479df9b99a42b31193313b75e492c06a51764e"}, + {file = "protobuf-6.33.4-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c7c64f259c618f0bef7bee042075e390debbf9682334be2b67408ec7c1c09ee6"}, + {file = "protobuf-6.33.4-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:3df850c2f8db9934de4cf8f9152f8dc2558f49f298f37f90c517e8e5c84c30e9"}, + {file = "protobuf-6.33.4-cp39-cp39-win32.whl", hash = "sha256:955478a89559fa4568f5a81dce77260eabc5c686f9e8366219ebd30debf06aa6"}, + {file = "protobuf-6.33.4-cp39-cp39-win_amd64.whl", hash = "sha256:0f12ddbf96912690c3582f9dffb55530ef32015ad8e678cd494312bd78314c4f"}, + {file = "protobuf-6.33.4-py3-none-any.whl", hash = "sha256:1fe3730068fcf2e595816a6c34fe66eeedd37d51d0400b72fabc848811fdc1bc"}, + {file = "protobuf-6.33.4.tar.gz", hash = "sha256:dc2e61bca3b10470c1912d166fe0af67bfc20eb55971dcef8dfa48ce14f0ed91"}, ] [[package]] name = "py4j" -version = "0.10.9.5" +version = "0.10.9.7" description = "Enables Python programs to dynamically access arbitrary Java objects" -optional = true -python-versions = "*" -files = [ - {file = "py4j-0.10.9.5-py2.py3-none-any.whl", hash = "sha256:52d171a6a2b031d8a5d1de6efe451cf4f5baff1a2819aabc3741c8406539ba04"}, - {file = "py4j-0.10.9.5.tar.gz", hash = "sha256:276a4a3c5a2154df1860ef3303a927460e02e97b047dc0a47c1c3fb8cce34db6"}, -] - -[[package]] -name = "pycodestyle" -version = "2.7.0" -description = "Python style guide checker" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = 
"sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, - {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, -] - -[[package]] -name = "pycparser" -version = "2.21" -description = "C parser in Python" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, - {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, -] - -[[package]] -name = "pydocstyle" -version = "6.1.1" -description = "Python docstring style checker" optional = false -python-versions = ">=3.6" +python-versions = "*" files = [ - {file = "pydocstyle-6.1.1-py3-none-any.whl", hash = "sha256:6987826d6775056839940041beef5c08cc7e3d71d63149b48e36727f70144dc4"}, - {file = "pydocstyle-6.1.1.tar.gz", hash = "sha256:1d41b7c459ba0ee6c345f2eb9ae827cab14a7533a88c5c6f7e94923f72df92dc"}, + {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, + {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, +] + +[[package]] +name = "pyarrow" +version = "21.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26"}, + {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79"}, + {file = "pyarrow-21.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:7be45519b830f7c24b21d630a31d48bcebfd5d4d7f9d3bdb49da9cdf6d764edb"}, + {file = "pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:26bfd95f6bff443ceae63c65dc7e048670b7e98bc892210acba7e4995d3d4b51"}, + {file = "pyarrow-21.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bd04ec08f7f8bd113c55868bd3fc442a9db67c27af098c5f814a3091e71cc61a"}, + {file = "pyarrow-21.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9b0b14b49ac10654332a805aedfc0147fb3469cbf8ea951b3d040dab12372594"}, + {file = "pyarrow-21.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9d9f8bcb4c3be7738add259738abdeddc363de1b80e3310e04067aa1ca596634"}, + {file = "pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c077f48aab61738c237802836fc3844f85409a46015635198761b0d6a688f87b"}, + {file = "pyarrow-21.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:689f448066781856237eca8d1975b98cace19b8dd2ab6145bf49475478bcaa10"}, + {file = "pyarrow-21.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:479ee41399fcddc46159a551705b89c05f11e8b8cb8e968f7fec64f62d91985e"}, + {file = "pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40ebfcb54a4f11bcde86bc586cbd0272bac0d516cfa539c799c2453768477569"}, + {file = "pyarrow-21.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8d58d8497814274d3d20214fbb24abcad2f7e351474357d552a8d53bce70c70e"}, + {file = "pyarrow-21.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:585e7224f21124dd57836b1530ac8f2df2afc43c861d7bf3d58a4870c42ae36c"}, + {file = "pyarrow-21.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:555ca6935b2cbca2c0e932bedd853e9bc523098c39636de9ad4693b5b1df86d6"}, + {file = 
"pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3a302f0e0963db37e0a24a70c56cf91a4faa0bca51c23812279ca2e23481fccd"}, + {file = "pyarrow-21.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:b6b27cf01e243871390474a211a7922bfbe3bda21e39bc9160daf0da3fe48876"}, + {file = "pyarrow-21.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e72a8ec6b868e258a2cd2672d91f2860ad532d590ce94cdf7d5e7ec674ccf03d"}, + {file = "pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b7ae0bbdc8c6674259b25bef5d2a1d6af5d39d7200c819cf99e07f7dfef1c51e"}, + {file = "pyarrow-21.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:58c30a1729f82d201627c173d91bd431db88ea74dcaa3885855bc6203e433b82"}, + {file = "pyarrow-21.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:072116f65604b822a7f22945a7a6e581cfa28e3454fdcc6939d4ff6090126623"}, + {file = "pyarrow-21.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:cf56ec8b0a5c8c9d7021d6fd754e688104f9ebebf1bf4449613c9531f5346a18"}, + {file = "pyarrow-21.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e99310a4ebd4479bcd1964dff9e14af33746300cb014aa4a3781738ac63baf4a"}, + {file = "pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d2fe8e7f3ce329a71b7ddd7498b3cfac0eeb200c2789bd840234f0dc271a8efe"}, + {file = "pyarrow-21.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f522e5709379d72fb3da7785aa489ff0bb87448a9dc5a75f45763a795a089ebd"}, + {file = "pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61"}, + {file = "pyarrow-21.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:731c7022587006b755d0bdb27626a1a3bb004bb56b11fb30d98b6c1b4718579d"}, + {file = "pyarrow-21.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dc56bc708f2d8ac71bd1dcb927e458c93cec10b98eb4120206a4091db7b67b99"}, + {file = "pyarrow-21.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:186aa00bca62139f75b7de8420f745f2af12941595bbbfa7ed3870ff63e25636"}, + {file = "pyarrow-21.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:a7a102574faa3f421141a64c10216e078df467ab9576684d5cd696952546e2da"}, + {file = "pyarrow-21.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:1e005378c4a2c6db3ada3ad4c217b381f6c886f0a80d6a316fe586b90f77efd7"}, + {file = "pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:65f8e85f79031449ec8706b74504a316805217b35b6099155dd7e227eef0d4b6"}, + {file = "pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3a81486adc665c7eb1a2bde0224cfca6ceaba344a82a971ef059678417880eb8"}, + {file = "pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503"}, + {file = "pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79"}, + {file = "pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10"}, + {file = "pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a7f6524e3747e35f80744537c78e7302cd41deee8baa668d56d55f77d9c464b3"}, + {file = "pyarrow-21.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:203003786c9fd253ebcafa44b03c06983c9c8d06c3145e37f1b76a1f317aeae1"}, + {file = "pyarrow-21.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b4d97e297741796fead24867a8dabf86c87e4584ccc03167e4a811f50fdf74d"}, + {file = 
"pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:898afce396b80fdda05e3086b4256f8677c671f7b1d27a6976fa011d3fd0a86e"}, + {file = "pyarrow-21.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:067c66ca29aaedae08218569a114e413b26e742171f526e828e1064fcdec13f4"}, + {file = "pyarrow-21.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0c4e75d13eb76295a49e0ea056eb18dbd87d81450bfeb8afa19a7e5a75ae2ad7"}, + {file = "pyarrow-21.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdc4c17afda4dab2a9c0b79148a43a7f4e1094916b3e18d8975bfd6d6d52241f"}, + {file = "pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc"}, ] -[package.dependencies] -snowballstemmer = "*" - [package.extras] -toml = ["toml"] - -[[package]] -name = "pyflakes" -version = "2.3.1" -description = "passive checker of Python programs" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, - {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, -] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] [[package]] name = "pygments" -version = "2.13.0" +version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." optional = false -python-versions = ">=3.6" -files = [ - {file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"}, - {file = "Pygments-2.13.0.tar.gz", hash = "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1"}, -] - -[package.extras] -plugins = ["importlib-metadata"] - -[[package]] -name = "pyparsing" -version = "3.0.9" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false -python-versions = ">=3.6.8" +python-versions = ">=3.8" files = [ - {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, - {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, + {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, + {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, ] [package.extras] -diagrams = ["jinja2", "railroad-diagrams"] +windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pyspark" -version = "3.2.2" +version = "3.5.0" description = "Apache Spark Python API" -optional = true -python-versions = ">=3.6" +optional = false +python-versions = ">=3.8" files = [ - {file = "pyspark-3.2.2.tar.gz", hash = "sha256:5455214cf0b83d4a184cda25ca3b0812481915353b180cf7d7ac227728a4d99e"}, + {file = "pyspark-3.5.0.tar.gz", hash = "sha256:d41a9b76bd2aca370a6100d075c029e22ba44c5940927877e9435a3a9c566558"}, ] [package.dependencies] -py4j = "0.10.9.5" +googleapis-common-protos = {version = ">=1.56.4", optional = true, markers = "extra == \"connect\""} +grpcio = {version = ">=1.56.0", optional = true, markers = "extra == \"connect\""} +grpcio-status = {version = ">=1.56.0", optional = true, markers = "extra == \"connect\""} +numpy = {version = ">=1.15", optional = true, markers = "extra == \"connect\""} +pandas = {version = ">=1.0.5", optional = true, markers = "extra == \"connect\""} +py4j = 
"0.10.9.7" +pyarrow = {version = ">=4.0.0", optional = true, markers = "extra == \"connect\""} [package.extras] -ml = ["numpy (>=1.7)"] -mllib = ["numpy (>=1.7)"] -pandas-on-spark = ["numpy (>=1.14)", "pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] -sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] +connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.56.0)", "grpcio-status (>=1.56.0)", "numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +ml = ["numpy (>=1.15)"] +mllib = ["numpy (>=1.15)"] +pandas-on-spark = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] +sql = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=4.0.0)"] [[package]] name = "pytest" -version = "6.2.5" +version = "8.4.2" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" files = [ - {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, - {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, + {file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"}, + {file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"}, ] [package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} -attrs = ">=19.2.0" -colorama = {version = "*", markers = "sys_platform == \"win32\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" -py = ">=1.8.2" -toml = "*" +colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""} +iniconfig = ">=1" +packaging = ">=20" +pluggy = ">=1.5,<2" +pygments = ">=2.7.2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-cov" -version = "2.12.1" +version = "4.1.0" description = "Pytest plugin for measuring coverage." 
optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.7" files = [ - {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, - {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, + {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, + {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, ] [package.dependencies] -coverage = ">=5.2.1" +coverage = {version = ">=5.2.1", extras = ["toml"]} pytest = ">=4.6" -toml = "*" [package.extras] testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] -[[package]] -name = "pytest-flake8" -version = "1.1.0" -description = "pytest plugin to check FLAKE8 requirements" -optional = false -python-versions = "*" -files = [ - {file = "pytest-flake8-1.1.0.tar.gz", hash = "sha256:358d449ca06b80dbadcb43506cd3e38685d273b4968ac825da871bd4cc436202"}, - {file = "pytest_flake8-1.1.0-py2.py3-none-any.whl", hash = "sha256:f1b19dad0b9f0aa651d391c9527ebc20ac1a0f847aa78581094c747462bfa182"}, -] - -[package.dependencies] -flake8 = ">=3.5" -pytest = ">=3.5" - [[package]] name = "pytest-rerunfailures" -version = "9.1.1" +version = "14.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false -python-versions = ">=3.5" +python-versions = ">=3.8" files = [ - {file = "pytest-rerunfailures-9.1.1.tar.gz", hash = "sha256:1cb11a17fc121b3918414eb5eaf314ee325f2e693ac7cb3f6abf7560790827f2"}, - {file = "pytest_rerunfailures-9.1.1-py3-none-any.whl", hash = "sha256:2eb7d0ad651761fbe80e064b0fd415cf6730cdbc53c16a145fd84b66143e609f"}, + {file = "pytest-rerunfailures-14.0.tar.gz", hash = "sha256:4a400bcbcd3c7a4ad151ab8afac123d90eca3abe27f98725dc4d9702887d2e92"}, + {file = "pytest_rerunfailures-14.0-py3-none-any.whl", hash = "sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32"}, ] [package.dependencies] -pytest = ">=5.0" -setuptools = ">=40.0" - -[[package]] -name = "pytest-runner" -version = "5.3.2" -description = "Invoke py.test as distutils command with dependency resolution" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-runner-5.3.2.tar.gz", hash = "sha256:48934ec94301f6727d30615af1960539ff62063f6c9b71b7227174e51ba5fb34"}, - {file = "pytest_runner-5.3.2-py3-none-any.whl", hash = "sha256:c7d785ea6c612396c11ddbaf467764d2cc746ef96a713fbe1a296c221503b7c3"}, -] - -[package.extras] -docs = ["jaraco.packaging (>=8.2)", "rst.linker (>=1.9)", "sphinx"] -testing = ["pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy", "pytest-virtualenv"] +packaging = ">=17.1" +pytest = ">=7.2" [[package]] name = "python-dateutil" -version = "2.8.2" +version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = 
"sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, ] [package.dependencies] @@ -965,355 +834,231 @@ six = ">=1.5" [[package]] name = "pytz" -version = "2022.2.1" +version = "2025.2" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" files = [ - {file = "pytz-2022.2.1-py2.py3-none-any.whl", hash = "sha256:220f481bdafa09c3955dfbdddb7b57780e9a94f5127e35456a48589b9e0c0197"}, - {file = "pytz-2022.2.1.tar.gz", hash = "sha256:cea221417204f2d1a2aa03ddae3e867921971d0d76f14d87abb4414415bbdcf5"}, -] - -[[package]] -name = "pywin32-ctypes" -version = "0.2.0" -description = "" -optional = false -python-versions = "*" -files = [ - {file = "pywin32-ctypes-0.2.0.tar.gz", hash = "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942"}, - {file = "pywin32_ctypes-0.2.0-py2.py3-none-any.whl", hash = "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98"}, + {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, + {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, ] [[package]] name = "pyyaml" -version = "6.0" +version = "6.0.3" description = "YAML parser and emitter for Python" optional = false -python-versions = ">=3.6" -files = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = 
"PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = 
"PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, -] - -[[package]] -name = "readme-renderer" -version = "37.1" -description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" -optional = false -python-versions = ">=3.7" -files = [ - {file = "readme_renderer-37.1-py3-none-any.whl", hash = "sha256:16c914ca7731fd062a316a2a8e5434a175ee34661a608af771a60c881f528a34"}, - {file = "readme_renderer-37.1.tar.gz", hash = "sha256:96768c069729f69176f514477e57f2f8cd543fbb2cd7bad372976249fa509a0c"}, -] - -[package.dependencies] -bleach = ">=2.1.0" -docutils = ">=0.13.1" -Pygments = ">=2.5.1" - -[package.extras] -md = ["cmarkgfm (>=0.8.0)"] - -[[package]] -name = "requests" -version = "2.31.0" -description = "Python HTTP for Humans." -optional = false -python-versions = ">=3.7" -files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, -] - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[[package]] -name = "requests-toolbelt" -version = "0.9.1" -description = "A utility belt for advanced users of python-requests" -optional = false -python-versions = "*" -files = [ - {file = "requests-toolbelt-0.9.1.tar.gz", hash = "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0"}, - {file = "requests_toolbelt-0.9.1-py2.py3-none-any.whl", hash = "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f"}, -] - -[package.dependencies] -requests = ">=2.0.1,<3.0.0" - -[[package]] -name = "rfc3986" -version = "2.0.0" -description = "Validating URI References per RFC 3986" -optional = false -python-versions = ">=3.7" -files = [ - {file = "rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd"}, - {file = "rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c"}, -] - -[package.extras] -idna2008 = ["idna"] - -[[package]] -name = "safety" -version = "1.10.3" -description = "Checks installed dependencies for known vulnerabilities." 
-optional = false -python-versions = ">=3.5" -files = [ - {file = "safety-1.10.3-py2.py3-none-any.whl", hash = "sha256:5f802ad5df5614f9622d8d71fedec2757099705c2356f862847c58c6dfe13e84"}, - {file = "safety-1.10.3.tar.gz", hash = "sha256:30e394d02a20ac49b7f65292d19d38fa927a8f9582cdfd3ad1adbbc66c641ad5"}, -] - -[package.dependencies] -Click = ">=6.0" -dparse = ">=0.5.1" -packaging = "*" -requests = "*" -setuptools = "*" - -[[package]] -name = "secretstorage" -version = "3.3.3" -description = "Python bindings to FreeDesktop.org Secret Service API" -optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"}, - {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"}, + {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"}, + {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"}, + {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"}, + {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"}, + {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"}, + {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"}, + {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198"}, + {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b"}, + {file = "pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0"}, + {file = "pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69"}, + {file = "pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e"}, + {file = "pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c"}, + {file = "pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = 
"sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e"}, + {file = "pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00"}, + {file = "pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d"}, + {file = "pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a"}, + {file = "pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4"}, + {file = "pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b"}, + {file = "pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf"}, + {file = "pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196"}, + {file = "pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c"}, + {file = "pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc"}, + {file = "pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e"}, + {file = "pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea"}, + {file = "pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5"}, + {file = "pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b"}, + {file = "pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd"}, + {file = "pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8"}, + {file = "pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5"}, + {file = "pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6"}, + {file = "pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6"}, + {file = "pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be"}, + {file = "pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26"}, + {file = "pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c"}, + {file = "pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb"}, + {file = "pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac"}, + {file = "pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788"}, + {file = "pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5"}, + {file = "pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764"}, + {file = "pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35"}, + {file = "pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac"}, + {file = "pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3"}, + {file = "pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3"}, + {file = "pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702"}, + {file = "pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c"}, + {file = "pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065"}, + {file = "pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65"}, + {file = 
"pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9"}, + {file = "pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b"}, + {file = "pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da"}, + {file = "pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5"}, + {file = "pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a"}, + {file = "pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926"}, + {file = "pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7"}, + {file = "pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0"}, + {file = "pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007"}, + {file = "pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f"}, ] -[package.dependencies] -cryptography = ">=2.0" -jeepney = ">=0.6" - [[package]] name = "setuptools" -version = "70.0.0" +version = "80.9.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, - {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, + {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, + {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", 
"ruff (>=0.8.0)"] +core = ["importlib_metadata (>=6)", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "six" -version = "1.16.0" +version = "1.17.0" description = "Python 2 and 3 compatibility utilities" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - -[[package]] -name = "snowballstemmer" -version = "2.2.0" -description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
-optional = false -python-versions = "*" -files = [ - {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, - {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, -] - -[[package]] -name = "toml" -version = "0.10.2" -description = "Python Library for Tom's Obvious, Minimal Language" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ - {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, - {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, + {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, + {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] [[package]] name = "tomli" -version = "1.2.3" +version = "2.4.0" description = "A lil' TOML parser" optional = false -python-versions = ">=3.6" -files = [ - {file = "tomli-1.2.3-py3-none-any.whl", hash = "sha256:e3069e4be3ead9668e21cb9b074cd948f7b3113fd9c8bba083f48247aab8b11c"}, - {file = "tomli-1.2.3.tar.gz", hash = "sha256:05b6166bff487dc068d322585c7ea4ef78deed501cc124060e0f238e89a9231f"}, -] - -[[package]] -name = "tqdm" -version = "4.64.1" -description = "Fast, Extensible Progress Meter" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" -files = [ - {file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"}, - {file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[package.extras] -dev = ["py-make (>=0.1.0)", "twine", "wheel"] -notebook = ["ipywidgets (>=6)"] -slack = ["slack-sdk"] -telegram = ["requests"] - -[[package]] -name = "twine" -version = "3.8.0" -description = "Collection of utilities for publishing packages on PyPI" -optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "twine-3.8.0-py3-none-any.whl", hash = "sha256:d0550fca9dc19f3d5e8eadfce0c227294df0a2a951251a4385797c8a6198b7c8"}, - {file = "twine-3.8.0.tar.gz", hash = "sha256:8efa52658e0ae770686a13b675569328f1fba9837e5de1867bfe5f46a9aefe19"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, + {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95"}, + {file = "tomli-2.4.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d"}, + {file = "tomli-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576"}, + {file = 
"tomli-2.4.0-cp311-cp311-win32.whl", hash = "sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a"}, + {file = "tomli-2.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa"}, + {file = "tomli-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1"}, + {file = "tomli-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a"}, + {file = "tomli-2.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b"}, + {file = "tomli-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51"}, + {file = "tomli-2.4.0-cp312-cp312-win32.whl", hash = "sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729"}, + {file = "tomli-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da"}, + {file = "tomli-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0"}, + {file = "tomli-2.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4"}, + {file = "tomli-2.4.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c"}, + {file = "tomli-2.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f"}, + {file = "tomli-2.4.0-cp313-cp313-win32.whl", hash = "sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86"}, + {file = "tomli-2.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87"}, + {file = "tomli-2.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6"}, + {file = "tomli-2.4.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66"}, + {file = "tomli-2.4.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702"}, + {file = "tomli-2.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8"}, + {file = "tomli-2.4.0-cp314-cp314-win32.whl", hash = "sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776"}, + {file = "tomli-2.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475"}, + {file = "tomli-2.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9"}, + {file = "tomli-2.4.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df"}, + {file = "tomli-2.4.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f"}, + {file = "tomli-2.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b"}, + {file = "tomli-2.4.0-cp314-cp314t-win32.whl", hash = "sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"}, + {file = "tomli-2.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd"}, + {file = "tomli-2.4.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4"}, + {file = "tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a"}, + {file = "tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c"}, ] -[package.dependencies] -colorama = ">=0.4.3" -importlib-metadata = ">=3.6" -keyring = ">=15.1" -pkginfo = ">=1.8.1" -readme-renderer = ">=21.0" -requests = ">=2.20" -requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0" -rfc3986 = ">=1.4.0" -tqdm = ">=4.14" -urllib3 = ">=1.26.0" - [[package]] name = "typing-extensions" -version = "4.3.0" -description = "Backported and Experimental Type Hints for Python 3.7+" +version = "4.15.0" +description = "Backported and Experimental Type Hints for Python 3.9+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"}, - {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"}, + {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, + {file = 
"typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] [[package]] -name = "urllib3" -version = "1.26.12" -description = "HTTP library with thread-safe connection pooling, file post, and more." +name = "tzdata" +version = "2025.3" +description = "Provider of IANA time zone data" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" +python-versions = ">=2" files = [ - {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, - {file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, + {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, + {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, ] -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] -socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] - [[package]] name = "virtualenv" -version = "20.16.5" +version = "20.36.1" description = "Virtual Python Environment builder" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "virtualenv-20.16.5-py3-none-any.whl", hash = "sha256:d07dfc5df5e4e0dbc92862350ad87a36ed505b978f6c39609dc489eadd5b0d27"}, - {file = "virtualenv-20.16.5.tar.gz", hash = "sha256:227ea1b9994fdc5ea31977ba3383ef296d7472ea85be9d6732e42a91c04e80da"}, + {file = "virtualenv-20.36.1-py3-none-any.whl", hash = "sha256:575a8d6b124ef88f6f51d56d656132389f961062a9177016a50e4f507bbcc19f"}, + {file = "virtualenv-20.36.1.tar.gz", hash = "sha256:8befb5c81842c641f8ee658481e42641c68b5eab3521d8e092d18320902466ba"}, ] [package.dependencies] -distlib = ">=0.3.5,<1" -filelock = ">=3.4.1,<4" -platformdirs = ">=2.4,<3" - -[package.extras] -docs = ["proselint (>=0.13)", "sphinx (>=5.1.1)", "sphinx-argparse (>=0.3.1)", "sphinx-rtd-theme (>=1)", "towncrier (>=21.9)"] -testing = ["coverage (>=6.2)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=21.3)", "pytest (>=7.0.1)", "pytest-env (>=0.6.2)", "pytest-freezegun (>=0.4.2)", "pytest-mock (>=3.6.1)", "pytest-randomly (>=3.10.3)", "pytest-timeout (>=2.1)"] - -[[package]] -name = "webencodings" -version = "0.5.1" -description = "Character encoding aliases for legacy web content" -optional = false -python-versions = "*" -files = [ - {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, - {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, -] - -[[package]] -name = "zipp" -version = "3.8.1" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.7" -files = [ - {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, - {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, +distlib = ">=0.3.7,<1" +filelock = [ + {version = ">=3.16.1,<4", markers = "python_version < \"3.10\""}, + {version = ">=3.20.1,<4", markers = "python_version >= \"3.10\""}, ] +platformdirs = ">=3.9.1,<5" 
+typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\""} [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] - -[extras] -pyspark = ["pyspark"] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] [metadata] lock-version = "2.0" -python-versions = ">=3.8,<4" -content-hash = "19b8066a730bfeebe017b2f1be860d76a005a46d64784d9458c555e018c77be5" +python-versions = ">=3.9,<4" +content-hash = "18db29f1829ab8baebdd68c486c74b5e7e4304a6d344a26773685b07b85fe7c3" diff --git a/pydeequ/__init__.py b/pydeequ/__init__.py index 49a06e5..6d2202f 100644 --- a/pydeequ/__init__.py +++ b/pydeequ/__init__.py @@ -11,35 +11,89 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -"""Placeholder docstrings""" -__version__ = "1.2.0" +""" +PyDeequ - Python API for Deequ data quality library. -from pyspark.sql import SparkSession +For PyDeequ 2.0 (Spark Connect), use: + from pydeequ.v2 import VerificationSuite, Check, CheckLevel + from pydeequ.v2.predicates import eq, gte -from pydeequ.analyzers import AnalysisRunner -from pydeequ.checks import Check, CheckLevel -from pydeequ.configs import DEEQU_MAVEN_COORD -from pydeequ.profiles import ColumnProfilerRunner +For PyDeequ 1.x (Legacy Py4J), set SPARK_VERSION env var and use: + from pydeequ import deequ_maven_coord + from pydeequ.checks import Check, CheckLevel +""" +__version__ = "2.0.0b1" -deequ_maven_coord = DEEQU_MAVEN_COORD -f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" +# Legacy imports are deferred to avoid requiring SPARK_VERSION for V2 users. +# V2 users should import from pydeequ.v2 directly. 
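To make the two deferral comments above concrete, here is a minimal sketch (editor's addition, not part of the diff; it uses only names defined in `pydeequ/__init__.py` in this change):

```python
# Editor's sketch: behaviour of the deferred legacy imports.
import pydeequ

# A plain import now works with no SPARK_VERSION set, because nothing
# from pydeequ.configs has been imported yet.
print(pydeequ.__version__)

# Only the first access of a legacy attribute routes through __getattr__
# and imports pydeequ.configs (which is where SPARK_VERSION is read):
# coord = pydeequ.deequ_maven_coord
```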
+_deequ_maven_coord = None +_f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all" -class PyDeequSession: - """ - For interacting with PyDeequ Modules at the "Runner" Level - """ - def __init__(self, spark_session: SparkSession): - self._spark_session = spark_session - self._sc = spark_session.sparkContext - self._jvm = spark_session._jvm +def __getattr__(name): + """Lazy loading for legacy module attributes.""" + global _deequ_maven_coord - def createColumnProfileRunner(self): - return ColumnProfilerRunner(self._spark_session) + if name == "deequ_maven_coord": + if _deequ_maven_coord is None: + from pydeequ.configs import DEEQU_MAVEN_COORD + _deequ_maven_coord = DEEQU_MAVEN_COORD + return _deequ_maven_coord - def createAnalysisRunner(self): - return AnalysisRunner(self._spark_session) + if name == "f2j_maven_coord": + return _f2j_maven_coord - def createCheck(self, level: CheckLevel, description: str, constraints=None): - return Check(self._spark_session, level, description, constraints) + if name in ("AnalysisRunner", "Check", "CheckLevel", "ColumnProfilerRunner", + "PyDeequSession", "DEEQU_MAVEN_COORD"): + # Import legacy modules on demand + if name == "AnalysisRunner": + from pydeequ.analyzers import AnalysisRunner + return AnalysisRunner + elif name == "Check": + from pydeequ.checks import Check + return Check + elif name == "CheckLevel": + from pydeequ.checks import CheckLevel + return CheckLevel + elif name == "ColumnProfilerRunner": + from pydeequ.profiles import ColumnProfilerRunner + return ColumnProfilerRunner + elif name == "DEEQU_MAVEN_COORD": + from pydeequ.configs import DEEQU_MAVEN_COORD + return DEEQU_MAVEN_COORD + + if name == "PyDeequSession": + # Return the lazily-defined class + return _get_pydeequ_session_class() + + raise AttributeError(f"module 'pydeequ' has no attribute '{name}'") + + +def _get_pydeequ_session_class(): + """Lazily create PyDeequSession class to avoid importing SparkSession at module load.""" + from pyspark.sql import SparkSession + from pydeequ.analyzers import AnalysisRunner + from pydeequ.checks import Check, CheckLevel + from pydeequ.profiles import ColumnProfilerRunner + + class PyDeequSession: + """ + For interacting with PyDeequ Modules at the "Runner" Level + """ + + def __init__(self, spark_session: SparkSession): + self._spark_session = spark_session + self._sc = spark_session.sparkContext + self._jvm = spark_session._jvm + + def createColumnProfileRunner(self): + return ColumnProfilerRunner(self._spark_session) + + def createAnalysisRunner(self): + return AnalysisRunner(self._spark_session) + + def createCheck(self, level: CheckLevel, description: str, constraints=None): + return Check(self._spark_session, level, description, constraints) + + return PyDeequSession diff --git a/pydeequ/analyzers.py b/pydeequ/analyzers.py index 3952c93..fa711c1 100644 --- a/pydeequ/analyzers.py +++ b/pydeequ/analyzers.py @@ -10,7 +10,6 @@ from pydeequ.repository import MetricsRepository, ResultKey from enum import Enum from pydeequ.scala_utils import to_scala_seq -from pydeequ.configs import SPARK_VERSION class _AnalyzerObject: """ @@ -852,4 +851,4 @@ def _create_java_object(self, jvm): elif self == DataTypeInstances.Fractional: return dataType_analyzers_class.Fractional() else: - raise ValueError(f"{jvm} is not a valid datatype Object") \ No newline at end of file + raise ValueError(f"{jvm} is not a valid datatype Object") diff --git a/pydeequ/checks.py b/pydeequ/checks.py index 749f74d..c0c6796 100644 --- a/pydeequ/checks.py +++ b/pydeequ/checks.py 
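Aside, before the `pydeequ/checks.py` hunk: the `analyzers.py` and `checks.py` changes only drop the module-level `SPARK_VERSION` import, so the 1.x Py4J entry points kept in `pydeequ/__init__.py` still work once `SPARK_VERSION` is set. A minimal sketch of that legacy path (editor's addition; assumes Spark 3.5 and the usual 1.x `spark.jars` setup):

```python
import os
os.environ["SPARK_VERSION"] = "3.5"  # assumption: any Spark version supported by 1.x

from pyspark.sql import SparkSession
import pydeequ

# Legacy Py4J path: pull the Deequ JAR onto the driver classpath.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

session = pydeequ.PyDeequSession(spark)
runner = session.createAnalysisRunner()  # same runner-level API as in 1.x
```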
@@ -6,7 +6,6 @@ from pydeequ.check_functions import is_one from pydeequ.scala_utils import ScalaFunction1, to_scala_seq -from pydeequ.configs import SPARK_VERSION # TODO implement custom assertions # TODO implement all methods without outside class dependencies diff --git a/pydeequ/v2/__init__.py b/pydeequ/v2/__init__.py new file mode 100644 index 0000000..cefe70d --- /dev/null +++ b/pydeequ/v2/__init__.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +""" +PyDeequ Spark Connect Module. + +This module provides Spark Connect compatible implementations of PyDeequ's +data quality verification capabilities. It replaces the Py4J-based bridge +with a protobuf-based communication protocol that works with Spark Connect's +client-server architecture. + +Key differences from the legacy Py4J-based PyDeequ: +1. Uses serializable predicates instead of Python lambdas +2. Communicates via protobuf messages over gRPC +3. No direct JVM access required + +Example usage: + from pyspark.sql import SparkSession + from pydeequ.v2 import VerificationSuite, Check, CheckLevel + from pydeequ.v2.predicates import gte, eq + + # Connect to Spark Connect server + spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + + # Create a check with constraints + check = (Check(CheckLevel.Error, "Data quality check") + .isComplete("id") + .hasCompleteness("email", gte(0.95)) + .hasSize(eq(1000))) + + # Run verification + result = (VerificationSuite(spark) + .onData(df) + .addCheck(check) + .run()) + + # Result is a DataFrame with check results + result.show() +""" + +# Import analyzers +from pydeequ.v2.analyzers import ( + ApproxCountDistinct, + ApproxQuantile, + Completeness, + Compliance, + Correlation, + CountDistinct, + DataType, + Distinctness, + Entropy, + Histogram, + Maximum, + MaxLength, + Mean, + Minimum, + MinLength, + MutualInformation, + PatternMatch, + Size, + StandardDeviation, + Sum, + Uniqueness, + UniqueValueRatio, +) + +# Import checks +from pydeequ.v2.checks import ( + Check, + CheckLevel, +) + +# Import predicates +from pydeequ.v2.predicates import ( + Predicate, + between, + eq, + gt, + gte, + is_non_negative, + is_one, + is_positive, + is_zero, + lt, + lte, + neq, +) + +# Import profiles +from pydeequ.v2.profiles import ( + ColumnProfilerRunner, + ColumnProfilerRunBuilder, + KLLParameters, +) + +# Import suggestions +from pydeequ.v2.suggestions import ( + ConstraintSuggestionRunner, + ConstraintSuggestionRunBuilder, + Rules, +) + +# Import verification +from pydeequ.v2.verification import ( + AnalysisRunBuilder, + AnalysisRunner, + VerificationRunBuilder, + VerificationSuite, +) + +__all__ = [ + # Predicates + "Predicate", + "eq", + "neq", + "gt", + "gte", + "lt", + "lte", + "between", + "is_one", + "is_zero", + "is_positive", + "is_non_negative", + # Checks + "Check", + "CheckLevel", + # Analyzers + "Size", + "Completeness", + "Mean", + "Sum", + "Maximum", + "Minimum", + "StandardDeviation", + "Distinctness", + "Uniqueness", + "UniqueValueRatio", + "CountDistinct", + "ApproxCountDistinct", + "ApproxQuantile", + "Correlation", + "MutualInformation", + "MaxLength", + "MinLength", + "PatternMatch", + "Compliance", + "Entropy", + "Histogram", + "DataType", + # Profiles + "ColumnProfilerRunner", + "ColumnProfilerRunBuilder", + "KLLParameters", + # Suggestions + "ConstraintSuggestionRunner", + "ConstraintSuggestionRunBuilder", + "Rules", + # Verification + "VerificationSuite", + "VerificationRunBuilder", + "AnalysisRunner", + "AnalysisRunBuilder", +] diff --git a/pydeequ/v2/analyzers.py 
b/pydeequ/v2/analyzers.py new file mode 100644 index 0000000..53a979c --- /dev/null +++ b/pydeequ/v2/analyzers.py @@ -0,0 +1,712 @@ +# -*- coding: utf-8 -*- +""" +Analyzer classes for Deequ Spark Connect. + +This module provides Spark Connect compatible analyzer classes that build +protobuf messages instead of using Py4J to call Scala code directly. + +Example usage: + from pydeequ.v2.analyzers import ( + AnalysisRunner, AnalyzerContext, + Size, Completeness, Mean, Maximum, Minimum + ) + + result = (AnalysisRunner(spark) + .onData(df) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("email")) + .addAnalyzer(Mean("amount")) + .run()) + + metrics = AnalyzerContext.successMetricsAsDataFrame(result) +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import List, Optional, Sequence, Union + +from pydeequ.v2.proto import deequ_connect_pb2 as proto + + +class _ConnectAnalyzer(ABC): + """Base class for Spark Connect compatible analyzers.""" + + @abstractmethod + def to_proto(self) -> proto.AnalyzerMessage: + """Convert analyzer to protobuf message.""" + raise NotImplementedError + + @abstractmethod + def __repr__(self) -> str: + raise NotImplementedError + + +# ============================================================================ +# Size Analyzer +# ============================================================================ + + +@dataclass +class Size(_ConnectAnalyzer): + """ + Computes the number of rows in a DataFrame. + + Args: + where: Optional SQL WHERE clause to filter rows before counting + + Example: + Size() # Count all rows + Size(where="status = 'active'") # Count only active rows + """ + + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Size") + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + if self.where: + return f"Size(where='{self.where}')" + return "Size()" + + +# ============================================================================ +# Completeness Analyzers +# ============================================================================ + + +@dataclass +class Completeness(_ConnectAnalyzer): + """ + Computes the fraction of non-null values in a column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + + Example: + Completeness("email") + Completeness("email", where="status = 'active'") + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Completeness", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + if self.where: + return f"Completeness('{self.column}', where='{self.where}')" + return f"Completeness('{self.column}')" + + +# ============================================================================ +# Statistical Analyzers +# ============================================================================ + + +@dataclass +class Mean(_ConnectAnalyzer): + """ + Computes the mean of a numeric column. 
+ + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Mean", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + if self.where: + return f"Mean('{self.column}', where='{self.where}')" + return f"Mean('{self.column}')" + + +@dataclass +class Sum(_ConnectAnalyzer): + """ + Computes the sum of a numeric column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Sum", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + if self.where: + return f"Sum('{self.column}', where='{self.where}')" + return f"Sum('{self.column}')" + + +@dataclass +class Maximum(_ConnectAnalyzer): + """ + Computes the maximum value of a numeric column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Maximum", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + if self.where: + return f"Maximum('{self.column}', where='{self.where}')" + return f"Maximum('{self.column}')" + + +@dataclass +class Minimum(_ConnectAnalyzer): + """ + Computes the minimum value of a numeric column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Minimum", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + if self.where: + return f"Minimum('{self.column}', where='{self.where}')" + return f"Minimum('{self.column}')" + + +@dataclass +class StandardDeviation(_ConnectAnalyzer): + """ + Computes the standard deviation of a numeric column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="StandardDeviation", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + if self.where: + return f"StandardDeviation('{self.column}', where='{self.where}')" + return f"StandardDeviation('{self.column}')" + + +# ============================================================================ +# Uniqueness Analyzers +# ============================================================================ + + +@dataclass +class Distinctness(_ConnectAnalyzer): + """ + Computes the fraction of distinct values in column(s). 
+ + Args: + columns: Column name(s) to analyze + where: Optional SQL WHERE clause to filter rows + """ + + columns: Union[str, Sequence[str]] + where: Optional[str] = None + + def __post_init__(self): + if isinstance(self.columns, str): + self.columns = [self.columns] + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Distinctness") + msg.columns.extend(self.columns) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"Distinctness({self.columns})" + + +@dataclass +class Uniqueness(_ConnectAnalyzer): + """ + Computes the fraction of unique values (appearing exactly once) in column(s). + + Args: + columns: Column name(s) to analyze + where: Optional SQL WHERE clause to filter rows + """ + + columns: Union[str, Sequence[str]] + where: Optional[str] = None + + def __post_init__(self): + if isinstance(self.columns, str): + self.columns = [self.columns] + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Uniqueness") + msg.columns.extend(self.columns) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"Uniqueness({self.columns})" + + +@dataclass +class UniqueValueRatio(_ConnectAnalyzer): + """ + Computes the ratio of unique values to total distinct values. + + Args: + columns: Column name(s) to analyze + where: Optional SQL WHERE clause to filter rows + """ + + columns: Union[str, Sequence[str]] + where: Optional[str] = None + + def __post_init__(self): + if isinstance(self.columns, str): + self.columns = [self.columns] + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="UniqueValueRatio") + msg.columns.extend(self.columns) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"UniqueValueRatio({self.columns})" + + +@dataclass +class CountDistinct(_ConnectAnalyzer): + """ + Computes the count of distinct values in column(s). + + Args: + columns: Column name(s) to analyze + """ + + columns: Union[str, Sequence[str]] + + def __post_init__(self): + if isinstance(self.columns, str): + self.columns = [self.columns] + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="CountDistinct") + msg.columns.extend(self.columns) + return msg + + def __repr__(self) -> str: + return f"CountDistinct({self.columns})" + + +@dataclass +class ApproxCountDistinct(_ConnectAnalyzer): + """ + Computes approximate count distinct using HyperLogLog. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="ApproxCountDistinct", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"ApproxCountDistinct('{self.column}')" + + +# ============================================================================ +# Quantile Analyzers +# ============================================================================ + + +@dataclass +class ApproxQuantile(_ConnectAnalyzer): + """ + Computes an approximate quantile of a numeric column. 
+ + Args: + column: Column name to analyze + quantile: Quantile to compute (0.0 to 1.0) + relative_error: Relative error tolerance (default 0.01) + where: Optional SQL WHERE clause to filter rows + """ + + column: str + quantile: float + relative_error: float = 0.01 + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage( + type="ApproxQuantile", + column=self.column, + quantile=self.quantile, + relative_error=self.relative_error, + ) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"ApproxQuantile('{self.column}', {self.quantile})" + + +# ============================================================================ +# Correlation Analyzers +# ============================================================================ + + +@dataclass +class Correlation(_ConnectAnalyzer): + """ + Computes Pearson correlation between two columns. + + Args: + column1: First column name + column2: Second column name + where: Optional SQL WHERE clause to filter rows + """ + + column1: str + column2: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Correlation") + msg.columns.extend([self.column1, self.column2]) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"Correlation('{self.column1}', '{self.column2}')" + + +@dataclass +class MutualInformation(_ConnectAnalyzer): + """ + Computes mutual information between columns. + + Args: + columns: Column names to analyze + where: Optional SQL WHERE clause to filter rows + """ + + columns: Sequence[str] + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="MutualInformation") + msg.columns.extend(self.columns) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"MutualInformation({self.columns})" + + +# ============================================================================ +# String Analyzers +# ============================================================================ + + +@dataclass +class MaxLength(_ConnectAnalyzer): + """ + Computes the maximum string length in a column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="MaxLength", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"MaxLength('{self.column}')" + + +@dataclass +class MinLength(_ConnectAnalyzer): + """ + Computes the minimum string length in a column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="MinLength", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"MinLength('{self.column}')" + + +# ============================================================================ +# Pattern Analyzers +# ============================================================================ + + +@dataclass +class PatternMatch(_ConnectAnalyzer): + """ + Computes the fraction of values matching a regex pattern. 
+ + Args: + column: Column name to analyze + pattern: Regex pattern to match + where: Optional SQL WHERE clause to filter rows + """ + + column: str + pattern: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage( + type="PatternMatch", column=self.column, pattern=self.pattern + ) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"PatternMatch('{self.column}', '{self.pattern}')" + + +# ============================================================================ +# Compliance Analyzer +# ============================================================================ + + +@dataclass +class Compliance(_ConnectAnalyzer): + """ + Computes the fraction of rows satisfying a SQL condition. + + Args: + instance: Name for this compliance check + predicate: SQL predicate (WHERE clause condition) + where: Optional additional SQL WHERE clause to filter rows + """ + + instance: str + predicate: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + # Use column for instance name and pattern for predicate + msg = proto.AnalyzerMessage( + type="Compliance", column=self.instance, pattern=self.predicate + ) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"Compliance('{self.instance}', '{self.predicate}')" + + +# ============================================================================ +# Entropy Analyzer +# ============================================================================ + + +@dataclass +class Entropy(_ConnectAnalyzer): + """ + Computes the entropy of a column. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Entropy", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"Entropy('{self.column}')" + + +# ============================================================================ +# Histogram Analyzer +# ============================================================================ + + +@dataclass +class Histogram(_ConnectAnalyzer): + """ + Computes histogram of values in a column. + + Args: + column: Column name to analyze + max_detail_bins: Maximum number of bins for detailed output + where: Optional SQL WHERE clause to filter rows + """ + + column: str + max_detail_bins: Optional[int] = None + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="Histogram", column=self.column) + if self.max_detail_bins is not None: + msg.max_detail_bins = self.max_detail_bins + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"Histogram('{self.column}')" + + +# ============================================================================ +# DataType Analyzer +# ============================================================================ + + +@dataclass +class DataType(_ConnectAnalyzer): + """ + Analyzes the data types present in a column. 
+ + Args: + column: Column name to analyze + where: Optional SQL WHERE clause to filter rows + """ + + column: str + where: Optional[str] = None + + def to_proto(self) -> proto.AnalyzerMessage: + msg = proto.AnalyzerMessage(type="DataType", column=self.column) + if self.where: + msg.where = self.where + return msg + + def __repr__(self) -> str: + return f"DataType('{self.column}')" + + +# Export all public symbols +__all__ = [ + # Base class + "_ConnectAnalyzer", + # Size + "Size", + # Completeness + "Completeness", + # Statistical + "Mean", + "Sum", + "Maximum", + "Minimum", + "StandardDeviation", + # Uniqueness + "Distinctness", + "Uniqueness", + "UniqueValueRatio", + "CountDistinct", + "ApproxCountDistinct", + # Quantile + "ApproxQuantile", + # Correlation + "Correlation", + "MutualInformation", + # String + "MaxLength", + "MinLength", + # Pattern + "PatternMatch", + # Compliance + "Compliance", + # Entropy + "Entropy", + # Histogram + "Histogram", + # DataType + "DataType", +] diff --git a/pydeequ/v2/checks.py b/pydeequ/v2/checks.py new file mode 100644 index 0000000..2a86ba8 --- /dev/null +++ b/pydeequ/v2/checks.py @@ -0,0 +1,938 @@ +# -*- coding: utf-8 -*- +""" +Check class for Deequ Spark Connect. + +This module provides a Spark Connect compatible Check class that builds +protobuf messages instead of using Py4J to call Scala code directly. + +Example usage: + from pydeequ.v2.checks import Check, CheckLevel + from pydeequ.v2.predicates import gte, eq, between + + check = (Check(CheckLevel.Error, "Data quality check") + .isComplete("id") + .hasCompleteness("email", gte(0.95)) + .hasSize(eq(1000)) + .hasMean("amount", between(100, 500))) +""" + +from __future__ import annotations + +from enum import Enum +from typing import List, Optional, Sequence, Union + +from pydeequ.v2.predicates import Predicate, is_one +from pydeequ.v2.proto import deequ_connect_pb2 as proto + + +class CheckLevel(Enum): + """Check severity level.""" + + Error = "Error" + Warning = "Warning" + + +class Check: + """ + Check class for Spark Connect - builds protobuf messages. + + A Check is a collection of constraints that can be applied to a DataFrame. + When the Check is run, each constraint is evaluated and the results are + aggregated based on the Check's level (Error or Warning). + + Unlike the Py4J-based Check, this class does not require a SparkSession + at construction time since it only builds protobuf messages. + + Example: + check = (Check(CheckLevel.Error, "Data quality check") + .isComplete("id") + .hasCompleteness("email", gte(0.95)) + .hasSize(eq(1000))) + """ + + def __init__(self, level: CheckLevel, description: str): + """ + Create a new Check. 
+ + Args: + level: The severity level (Error or Warning) + description: Human-readable description of this check + """ + self.level = level + self.description = description + self._constraints: List[proto.ConstraintMessage] = [] + + def _add_constraint( + self, + constraint_type: str, + column: Optional[str] = None, + columns: Optional[Sequence[str]] = None, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + where: Optional[str] = None, + pattern: Optional[str] = None, + column_condition: Optional[str] = None, + constraint_name: Optional[str] = None, + allowed_values: Optional[Sequence[str]] = None, + quantile: Optional[float] = None, + ) -> "Check": + """Internal method to add a constraint.""" + constraint = proto.ConstraintMessage(type=constraint_type) + + if column is not None: + constraint.column = column + if columns is not None: + constraint.columns.extend(columns) + if assertion is not None: + constraint.assertion.CopyFrom(assertion.to_proto()) + if hint is not None: + constraint.hint = hint + if where is not None: + constraint.where = where + if pattern is not None: + constraint.pattern = pattern + if column_condition is not None: + constraint.column_condition = column_condition + if constraint_name is not None: + constraint.constraint_name = constraint_name + if allowed_values is not None: + constraint.allowed_values.extend(allowed_values) + if quantile is not None: + constraint.quantile = quantile + + self._constraints.append(constraint) + return self + + # ======================================================================== + # Size Constraints + # ======================================================================== + + def hasSize(self, assertion: Predicate, hint: Optional[str] = None) -> "Check": + """ + Check that the DataFrame has a size satisfying the assertion. + + Args: + assertion: Predicate to apply to the row count + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.hasSize(eq(1000)) # Must have exactly 1000 rows + check.hasSize(gte(100)) # Must have at least 100 rows + """ + return self._add_constraint("hasSize", assertion=assertion, hint=hint) + + # ======================================================================== + # Completeness Constraints + # ======================================================================== + + def isComplete(self, column: str, hint: Optional[str] = None) -> "Check": + """ + Check that a column has no null values (100% complete). + + Args: + column: Column name to check + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.isComplete("id") # id column must have no nulls + """ + return self._add_constraint( + "isComplete", column=column, assertion=is_one(), hint=hint + ) + + def hasCompleteness( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that a column's completeness satisfies the assertion. + + Completeness is the fraction of non-null values (0.0 to 1.0). 
+ + Args: + column: Column name to check + assertion: Predicate to apply to completeness value + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.hasCompleteness("email", gte(0.95)) # At least 95% complete + """ + return self._add_constraint( + "hasCompleteness", column=column, assertion=assertion, hint=hint + ) + + def areComplete( + self, columns: Sequence[str], hint: Optional[str] = None + ) -> "Check": + """ + Check that all specified columns have no null values. + + Args: + columns: Column names to check + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.areComplete(["id", "name", "email"]) + """ + return self._add_constraint( + "areComplete", columns=columns, assertion=is_one(), hint=hint + ) + + def haveCompleteness( + self, columns: Sequence[str], assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that combined completeness of columns satisfies the assertion. + + Args: + columns: Column names to check + assertion: Predicate to apply to completeness value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "haveCompleteness", columns=columns, assertion=assertion, hint=hint + ) + + # ======================================================================== + # Uniqueness Constraints + # ======================================================================== + + def isUnique(self, column: str, hint: Optional[str] = None) -> "Check": + """ + Check that a column has only unique values. + + Args: + column: Column name to check + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.isUnique("id") # id must be unique + """ + return self._add_constraint("isUnique", column=column, hint=hint) + + def hasUniqueness( + self, columns: Sequence[str], assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that uniqueness of column(s) satisfies the assertion. + + Uniqueness is the fraction of unique values. + + Args: + columns: Column names to check + assertion: Predicate to apply to uniqueness value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasUniqueness", columns=columns, assertion=assertion, hint=hint + ) + + def hasDistinctness( + self, columns: Sequence[str], assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that distinctness of column(s) satisfies the assertion. + + Distinctness is the fraction of distinct values. + + Args: + columns: Column names to check + assertion: Predicate to apply to distinctness value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasDistinctness", columns=columns, assertion=assertion, hint=hint + ) + + def hasUniqueValueRatio( + self, columns: Sequence[str], assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that unique value ratio of column(s) satisfies the assertion. 
+ + Args: + columns: Column names to check + assertion: Predicate to apply to ratio value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasUniqueValueRatio", columns=columns, assertion=assertion, hint=hint + ) + + # ======================================================================== + # Statistical Constraints + # ======================================================================== + + def hasMin( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that the minimum value of a column satisfies the assertion. + + Args: + column: Column name to check + assertion: Predicate to apply to minimum value + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.hasMin("age", gte(0)) # Age must be non-negative + """ + return self._add_constraint( + "hasMin", column=column, assertion=assertion, hint=hint + ) + + def hasMax( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that the maximum value of a column satisfies the assertion. + + Args: + column: Column name to check + assertion: Predicate to apply to maximum value + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.hasMax("price", lte(10000)) # Price must be <= 10000 + """ + return self._add_constraint( + "hasMax", column=column, assertion=assertion, hint=hint + ) + + def hasMean( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that the mean value of a column satisfies the assertion. + + Args: + column: Column name to check + assertion: Predicate to apply to mean value + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.hasMean("score", between(70, 90)) + """ + return self._add_constraint( + "hasMean", column=column, assertion=assertion, hint=hint + ) + + def hasSum( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that the sum of a column satisfies the assertion. + + Args: + column: Column name to check + assertion: Predicate to apply to sum value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasSum", column=column, assertion=assertion, hint=hint + ) + + def hasStandardDeviation( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that the standard deviation of a column satisfies the assertion. + + Args: + column: Column name to check + assertion: Predicate to apply to std dev value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasStandardDeviation", column=column, assertion=assertion, hint=hint + ) + + def hasApproxCountDistinct( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that the approximate count distinct satisfies the assertion. 
+ + Args: + column: Column name to check + assertion: Predicate to apply to count distinct value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasApproxCountDistinct", column=column, assertion=assertion, hint=hint + ) + + def hasApproxQuantile( + self, + column: str, + quantile: float, + assertion: Predicate, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that an approximate quantile satisfies the assertion. + + Args: + column: Column name to check + quantile: Quantile to compute (0.0 to 1.0) + assertion: Predicate to apply to quantile value + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.hasApproxQuantile("income", 0.5, between(30000, 80000)) # Median + """ + return self._add_constraint( + "hasApproxQuantile", + column=column, + quantile=quantile, + assertion=assertion, + hint=hint, + ) + + def hasCorrelation( + self, + column_a: str, + column_b: str, + assertion: Predicate, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that correlation between two columns satisfies the assertion. + + Args: + column_a: First column name + column_b: Second column name + assertion: Predicate to apply to correlation value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasCorrelation", + columns=[column_a, column_b], + assertion=assertion, + hint=hint, + ) + + def hasEntropy( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that the entropy of a column satisfies the assertion. + + Args: + column: Column name to check + assertion: Predicate to apply to entropy value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasEntropy", column=column, assertion=assertion, hint=hint + ) + + def hasMutualInformation( + self, + column_a: str, + column_b: str, + assertion: Predicate, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that mutual information between columns satisfies the assertion. + + Args: + column_a: First column name + column_b: Second column name + assertion: Predicate to apply to mutual information value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasMutualInformation", + columns=[column_a, column_b], + assertion=assertion, + hint=hint, + ) + + # ======================================================================== + # String Length Constraints + # ======================================================================== + + def hasMinLength( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that minimum string length satisfies the assertion. + + Args: + column: Column name to check + assertion: Predicate to apply to min length value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasMinLength", column=column, assertion=assertion, hint=hint + ) + + def hasMaxLength( + self, column: str, assertion: Predicate, hint: Optional[str] = None + ) -> "Check": + """ + Check that maximum string length satisfies the assertion. 
+ + Args: + column: Column name to check + assertion: Predicate to apply to max length value + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "hasMaxLength", column=column, assertion=assertion, hint=hint + ) + + # ======================================================================== + # Pattern & Content Constraints + # ======================================================================== + + def hasPattern( + self, + column: str, + pattern: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that values match a regex pattern. + + Args: + column: Column name to check + pattern: Regex pattern to match + assertion: Predicate to apply to match fraction (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.hasPattern("phone", r"^\\d{3}-\\d{3}-\\d{4}$") + """ + return self._add_constraint( + "hasPattern", + column=column, + pattern=pattern, + assertion=assertion or is_one(), + hint=hint, + ) + + def containsEmail( + self, + column: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that values contain valid email addresses. + + Args: + column: Column name to check + assertion: Predicate to apply to match fraction (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "containsEmail", column=column, assertion=assertion or is_one(), hint=hint + ) + + def containsURL( + self, + column: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that values contain valid URLs. + + Args: + column: Column name to check + assertion: Predicate to apply to match fraction (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "containsURL", column=column, assertion=assertion or is_one(), hint=hint + ) + + def containsCreditCardNumber( + self, + column: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that values contain valid credit card numbers. + + Args: + column: Column name to check + assertion: Predicate to apply to match fraction (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "containsCreditCardNumber", + column=column, + assertion=assertion or is_one(), + hint=hint, + ) + + def containsSocialSecurityNumber( + self, + column: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that values contain valid SSNs. + + Args: + column: Column name to check + assertion: Predicate to apply to match fraction (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "containsSocialSecurityNumber", + column=column, + assertion=assertion or is_one(), + hint=hint, + ) + + # ======================================================================== + # Comparison Constraints + # ======================================================================== + + def isPositive( + self, + column: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that all values in a column are positive. 
+ + Args: + column: Column name to check + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "isPositive", column=column, assertion=assertion or is_one(), hint=hint + ) + + def isNonNegative( + self, + column: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that all values in a column are non-negative. + + Args: + column: Column name to check + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "isNonNegative", column=column, assertion=assertion or is_one(), hint=hint + ) + + def isLessThan( + self, + column_a: str, + column_b: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that column_a < column_b for all rows. + + Args: + column_a: First column name + column_b: Second column name + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "isLessThan", + columns=[column_a, column_b], + assertion=assertion or is_one(), + hint=hint, + ) + + def isLessThanOrEqualTo( + self, + column_a: str, + column_b: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that column_a <= column_b for all rows. + + Args: + column_a: First column name + column_b: Second column name + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "isLessThanOrEqualTo", + columns=[column_a, column_b], + assertion=assertion or is_one(), + hint=hint, + ) + + def isGreaterThan( + self, + column_a: str, + column_b: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that column_a > column_b for all rows. + + Args: + column_a: First column name + column_b: Second column name + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "isGreaterThan", + columns=[column_a, column_b], + assertion=assertion or is_one(), + hint=hint, + ) + + def isGreaterThanOrEqualTo( + self, + column_a: str, + column_b: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that column_a >= column_b for all rows. + + Args: + column_a: First column name + column_b: Second column name + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + """ + return self._add_constraint( + "isGreaterThanOrEqualTo", + columns=[column_a, column_b], + assertion=assertion or is_one(), + hint=hint, + ) + + def isContainedIn( + self, + column: str, + allowed_values: Sequence[str], + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that all values are in the allowed set. 
+ + Args: + column: Column name to check + allowed_values: List of allowed values + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.isContainedIn("status", ["active", "inactive", "pending"]) + """ + return self._add_constraint( + "isContainedIn", + column=column, + allowed_values=allowed_values, + assertion=assertion or is_one(), + hint=hint, + ) + + # ======================================================================== + # Custom Constraints + # ======================================================================== + + def satisfies( + self, + column_condition: str, + constraint_name: str, + assertion: Optional[Predicate] = None, + hint: Optional[str] = None, + ) -> "Check": + """ + Check that rows satisfy a SQL condition. + + Args: + column_condition: SQL WHERE clause condition + constraint_name: Name for this constraint + assertion: Predicate to apply to compliance (default: is_one) + hint: Optional hint message for failures + + Returns: + self for method chaining + + Example: + check.satisfies("price > 0 AND quantity > 0", "positive_values") + """ + return self._add_constraint( + "satisfies", + column_condition=column_condition, + constraint_name=constraint_name, + assertion=assertion or is_one(), + hint=hint, + ) + + # ======================================================================== + # Filter (WHERE clause) + # ======================================================================== + + def where(self, filter_condition: str) -> "Check": + """ + Apply a filter to the last added constraint. + + Args: + filter_condition: SQL WHERE clause to filter rows + + Returns: + self for method chaining + + Example: + check.isComplete("email").where("status = 'active'") + """ + if self._constraints: + self._constraints[-1].where = filter_condition + return self + + # ======================================================================== + # Serialization + # ======================================================================== + + def to_proto(self) -> proto.CheckMessage: + """ + Convert this Check to a protobuf message. + + Returns: + CheckMessage protobuf + """ + level = ( + proto.CheckMessage.Level.ERROR + if self.level == CheckLevel.Error + else proto.CheckMessage.Level.WARNING + ) + + check_msg = proto.CheckMessage(level=level, description=self.description) + check_msg.constraints.extend(self._constraints) + + return check_msg + + def __repr__(self) -> str: + return f"Check(level={self.level.value}, description='{self.description}', constraints={len(self._constraints)})" + + +# Export all public symbols +__all__ = [ + "Check", + "CheckLevel", +] diff --git a/pydeequ/v2/predicates.py b/pydeequ/v2/predicates.py new file mode 100644 index 0000000..adaf23d --- /dev/null +++ b/pydeequ/v2/predicates.py @@ -0,0 +1,274 @@ +# -*- coding: utf-8 -*- +""" +Serializable predicates for Deequ Spark Connect. + +These predicates replace Python lambda functions that were used in the Py4J-based +PyDeequ. Since lambdas cannot be serialized over Spark Connect's gRPC channel, +we use these predicate classes that serialize to protobuf messages. 
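+Each predicate converts to a PredicateMessage (an operator plus its bound or
+bounds) via to_proto(), so the assertion is evaluated by the server-side Deequ
+plugin rather than in the Python client.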
+ +Example usage: + # Old (Py4J) - NOT serializable + check.hasSize(lambda x: x >= 100) + check.hasCompleteness("col", lambda x: x >= 0.95) + + # New (Spark Connect) - Serializable + from pydeequ.v2.predicates import gte, eq, between + + check.hasSize(gte(100)) + check.hasCompleteness("col", gte(0.95)) + check.hasMean("amount", between(100, 200)) +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Union + +from pydeequ.v2.proto import deequ_connect_pb2 as proto + + +class Predicate(ABC): + """Base class for serializable predicates.""" + + @abstractmethod + def to_proto(self) -> proto.PredicateMessage: + """Convert predicate to protobuf message.""" + raise NotImplementedError + + @abstractmethod + def __repr__(self) -> str: + raise NotImplementedError + + +@dataclass +class Comparison(Predicate): + """Comparison predicate for single-value comparisons.""" + + operator: proto.PredicateMessage.Operator + value: float + + def to_proto(self) -> proto.PredicateMessage: + return proto.PredicateMessage(operator=self.operator, value=self.value) + + def __repr__(self) -> str: + op_map = { + proto.PredicateMessage.Operator.EQ: "==", + proto.PredicateMessage.Operator.NE: "!=", + proto.PredicateMessage.Operator.GT: ">", + proto.PredicateMessage.Operator.GE: ">=", + proto.PredicateMessage.Operator.LT: "<", + proto.PredicateMessage.Operator.LE: "<=", + } + return f"x {op_map.get(self.operator, '?')} {self.value}" + + +@dataclass +class Between(Predicate): + """Between predicate for range checks (inclusive).""" + + lower: float + upper: float + + def to_proto(self) -> proto.PredicateMessage: + return proto.PredicateMessage( + operator=proto.PredicateMessage.Operator.BETWEEN, + lower_bound=self.lower, + upper_bound=self.upper, + ) + + def __repr__(self) -> str: + return f"{self.lower} <= x <= {self.upper}" + + +# ============================================================================ +# Factory Functions - Convenient way to create predicates +# ============================================================================ + + +def eq(value: Union[int, float]) -> Predicate: + """ + Create an equality predicate (x == value). + + Args: + value: The value to compare against + + Returns: + Predicate that checks if metric equals value + + Example: + check.hasSize(eq(100)) # size must equal 100 + """ + return Comparison(proto.PredicateMessage.Operator.EQ, float(value)) + + +def neq(value: Union[int, float]) -> Predicate: + """ + Create a not-equal predicate (x != value). + + Args: + value: The value to compare against + + Returns: + Predicate that checks if metric does not equal value + + Example: + check.hasSize(neq(0)) # size must not be zero + """ + return Comparison(proto.PredicateMessage.Operator.NE, float(value)) + + +def gt(value: Union[int, float]) -> Predicate: + """ + Create a greater-than predicate (x > value). + + Args: + value: The value to compare against + + Returns: + Predicate that checks if metric is greater than value + + Example: + check.hasSize(gt(0)) # size must be greater than 0 + """ + return Comparison(proto.PredicateMessage.Operator.GT, float(value)) + + +def gte(value: Union[int, float]) -> Predicate: + """ + Create a greater-than-or-equal predicate (x >= value). 
+ + Args: + value: The value to compare against + + Returns: + Predicate that checks if metric is >= value + + Example: + check.hasCompleteness("col", gte(0.95)) # at least 95% complete + """ + return Comparison(proto.PredicateMessage.Operator.GE, float(value)) + + +def lt(value: Union[int, float]) -> Predicate: + """ + Create a less-than predicate (x < value). + + Args: + value: The value to compare against + + Returns: + Predicate that checks if metric is less than value + + Example: + check.hasMean("errors", lt(10)) # mean errors less than 10 + """ + return Comparison(proto.PredicateMessage.Operator.LT, float(value)) + + +def lte(value: Union[int, float]) -> Predicate: + """ + Create a less-than-or-equal predicate (x <= value). + + Args: + value: The value to compare against + + Returns: + Predicate that checks if metric is <= value + + Example: + check.hasMax("price", lte(1000)) # max price <= 1000 + """ + return Comparison(proto.PredicateMessage.Operator.LE, float(value)) + + +def between(lower: Union[int, float], upper: Union[int, float]) -> Predicate: + """ + Create a between predicate (lower <= x <= upper). + + Args: + lower: Lower bound (inclusive) + upper: Upper bound (inclusive) + + Returns: + Predicate that checks if metric is within range + + Example: + check.hasMean("age", between(18, 65)) # mean age between 18 and 65 + """ + return Between(float(lower), float(upper)) + + +def is_one() -> Predicate: + """ + Create a predicate that checks if value equals 1.0. + + This is the default assertion for many constraints like isComplete(). + + Returns: + Predicate that checks if metric equals 1.0 + + Example: + check.hasCompleteness("col", is_one()) # 100% complete + """ + return eq(1.0) + + +def is_zero() -> Predicate: + """ + Create a predicate that checks if value equals 0.0. + + Returns: + Predicate that checks if metric equals 0.0 + + Example: + check.hasMean("null_count", is_zero()) # no nulls + """ + return eq(0.0) + + +def is_positive() -> Predicate: + """ + Create a predicate that checks if value is positive (> 0). + + Returns: + Predicate that checks if metric is greater than 0 + + Example: + check.hasMin("quantity", is_positive()) # all quantities positive + """ + return gt(0.0) + + +def is_non_negative() -> Predicate: + """ + Create a predicate that checks if value is non-negative (>= 0). + + Returns: + Predicate that checks if metric is >= 0 + + Example: + check.hasMin("balance", is_non_negative()) # no negative balances + """ + return gte(0.0) + + +# Export all public symbols +__all__ = [ + # Base classes + "Predicate", + "Comparison", + "Between", + # Factory functions + "eq", + "neq", + "gt", + "gte", + "lt", + "lte", + "between", + "is_one", + "is_zero", + "is_positive", + "is_non_negative", +] diff --git a/pydeequ/v2/profiles.py b/pydeequ/v2/profiles.py new file mode 100644 index 0000000..97f71ef --- /dev/null +++ b/pydeequ/v2/profiles.py @@ -0,0 +1,282 @@ +# -*- coding: utf-8 -*- +""" +Column Profiler for Deequ Spark Connect. + +This module provides column profiling capabilities that analyze DataFrame columns +to compute statistics like completeness, data type distribution, and optional +KLL sketch-based quantile estimation. 
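+Profiling requests are carried to the server as a DeequColumnProfilerRelation
+protobuf message; the results come back as a regular DataFrame with one row per
+profiled column.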
+ +Example usage: + from pyspark.sql import SparkSession + from pydeequ.v2.profiles import ColumnProfilerRunner, KLLParameters + + spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + + # Basic profiling + profiles = (ColumnProfilerRunner(spark) + .onData(df) + .run()) + + # With KLL profiling for quantile estimation + profiles = (ColumnProfilerRunner(spark) + .onData(df) + .withKLLProfiling() + .setKLLParameters(KLLParameters(sketch_size=2048)) + .run()) + + profiles.show() # Result is a DataFrame with one row per column +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, Optional, Sequence + +from google.protobuf import any_pb2 + +from pydeequ.v2.proto import deequ_connect_pb2 as proto +from pydeequ.v2.spark_helpers import create_deequ_plan, dataframe_from_plan + +if TYPE_CHECKING: + from pyspark.sql import DataFrame, SparkSession + + +@dataclass +class KLLParameters: + """ + Parameters for KLL sketch-based quantile estimation. + + KLL sketches provide memory-efficient approximate quantile computation + for numeric columns. + + Attributes: + sketch_size: Size of the KLL sketch (default: 2048) + shrinking_factor: Factor for sketch compaction (default: 0.64) + num_buckets: Number of histogram buckets for distribution (default: 64) + """ + + sketch_size: int = 2048 + shrinking_factor: float = 0.64 + num_buckets: int = 64 + + def to_proto(self) -> proto.KLLParameters: + """Convert to protobuf message.""" + return proto.KLLParameters( + sketch_size=self.sketch_size, + shrinking_factor=self.shrinking_factor, + number_of_buckets=self.num_buckets, + ) + + +class ColumnProfilerRunner: + """ + Entry point for running column profiling. + + ColumnProfilerRunner analyzes DataFrame columns to compute statistics + including completeness, data type, distinct values, and optionally + KLL sketches for numeric columns. + + Example: + profiles = (ColumnProfilerRunner(spark) + .onData(df) + .restrictToColumns(["col1", "col2"]) + .withKLLProfiling() + .run()) + """ + + def __init__(self, spark: "SparkSession"): + """ + Create a new ColumnProfilerRunner. + + Args: + spark: SparkSession (can be either local or Spark Connect) + """ + self._spark = spark + + def onData(self, df: "DataFrame") -> "ColumnProfilerRunBuilder": + """ + Specify the DataFrame to profile. + + Args: + df: DataFrame to profile + + Returns: + ColumnProfilerRunBuilder for method chaining + """ + return ColumnProfilerRunBuilder(self._spark, df) + + +class ColumnProfilerRunBuilder: + """ + Builder for configuring and executing a column profiling run. + + This class collects profiling options and executes the profiling + when run() is called. + """ + + def __init__(self, spark: "SparkSession", df: "DataFrame"): + """ + Create a new ColumnProfilerRunBuilder. + + Args: + spark: SparkSession + df: DataFrame to profile + """ + self._spark = spark + self._df = df + self._restrict_to_columns: Optional[Sequence[str]] = None + self._low_cardinality_threshold: int = 0 + self._enable_kll: bool = False + self._kll_parameters: Optional[KLLParameters] = None + self._predefined_types: Optional[Dict[str, str]] = None + + def restrictToColumns(self, columns: Sequence[str]) -> "ColumnProfilerRunBuilder": + """ + Restrict profiling to specific columns. 
+ + Args: + columns: List of column names to profile + + Returns: + self for method chaining + """ + self._restrict_to_columns = columns + return self + + def withLowCardinalityHistogramThreshold( + self, threshold: int + ) -> "ColumnProfilerRunBuilder": + """ + Set threshold for computing histograms. + + Columns with distinct values <= threshold will have histograms computed. + + Args: + threshold: Maximum distinct values for histogram computation + + Returns: + self for method chaining + """ + self._low_cardinality_threshold = threshold + return self + + def withKLLProfiling(self) -> "ColumnProfilerRunBuilder": + """ + Enable KLL sketch profiling for numeric columns. + + KLL sketches provide approximate quantile estimation. + + Returns: + self for method chaining + """ + self._enable_kll = True + return self + + def setKLLParameters(self, params: KLLParameters) -> "ColumnProfilerRunBuilder": + """ + Set KLL sketch parameters. + + Args: + params: KLLParameters configuration + + Returns: + self for method chaining + """ + self._kll_parameters = params + return self + + def setPredefinedTypes( + self, types: Dict[str, str] + ) -> "ColumnProfilerRunBuilder": + """ + Set predefined data types for columns. + + This overrides automatic type inference for specified columns. + + Args: + types: Dictionary mapping column names to type names. + Supported types: "String", "Integer", "Long", "Double", "Boolean" + + Returns: + self for method chaining + """ + self._predefined_types = types + return self + + def run(self) -> "DataFrame": + """ + Execute the profiling and return results as a DataFrame. + + The result DataFrame contains columns: + - column: Column name + - completeness: Non-null ratio (0.0-1.0) + - approx_distinct_values: Approximate cardinality + - data_type: Detected/provided type + - is_data_type_inferred: Whether type was inferred + - type_counts: JSON string of type counts + - histogram: JSON string of histogram (or null) + - mean, minimum, maximum, sum, std_dev: Numeric stats (null for non-numeric) + - approx_percentiles: JSON array of percentiles (null if not computed) + - kll_buckets: JSON string of KLL buckets (null if KLL disabled) + + Returns: + DataFrame with profiling results (one row per column) + + Raises: + RuntimeError: If the Deequ plugin is not available on the server + """ + # Build the protobuf message + profiler_msg = self._build_profiler_message() + + # V2 only supports Spark Connect + return self._run_via_spark_connect(profiler_msg) + + def _build_profiler_message(self) -> proto.DeequColumnProfilerRelation: + """Build the protobuf profiler message.""" + msg = proto.DeequColumnProfilerRelation() + + # Set column restrictions + if self._restrict_to_columns: + msg.restrict_to_columns.extend(self._restrict_to_columns) + + # Set histogram threshold + if self._low_cardinality_threshold > 0: + msg.low_cardinality_histogram_threshold = self._low_cardinality_threshold + + # Set KLL profiling + msg.enable_kll_profiling = self._enable_kll + if self._kll_parameters: + msg.kll_parameters.CopyFrom(self._kll_parameters.to_proto()) + + # Set predefined types + if self._predefined_types: + for col, dtype in self._predefined_types.items(): + msg.predefined_types[col] = dtype + + return msg + + def _run_via_spark_connect( + self, msg: proto.DeequColumnProfilerRelation + ) -> "DataFrame": + """Execute profiling via Spark Connect plugin.""" + # Get the input DataFrame's plan as serialized bytes + input_plan = self._df._plan.to_proto(self._spark._client) + msg.input_relation = 
input_plan.root.SerializeToString() + + # Wrap our Deequ message in a google.protobuf.Any + extension = any_pb2.Any() + extension.Pack(msg, type_url_prefix="type.googleapis.com") + + # Create a proper LogicalPlan subclass with the extension + plan = create_deequ_plan(extension) + + # Create DataFrame from the plan (handles Spark 3.x vs 4.x) + return dataframe_from_plan(plan, self._spark) + + +# Export all public symbols +__all__ = [ + "ColumnProfilerRunner", + "ColumnProfilerRunBuilder", + "KLLParameters", +] diff --git a/pydeequ/v2/proto/__init__.py b/pydeequ/v2/proto/__init__.py new file mode 100644 index 0000000..b33a48d --- /dev/null +++ b/pydeequ/v2/proto/__init__.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +""" +Generated protobuf classes for Deequ Spark Connect. +""" + +from pydeequ.v2.proto.deequ_connect_pb2 import ( + AnalyzerMessage, + CheckMessage, + CheckStatus, + ConstraintMessage, + ConstraintStatus, + DeequAnalysisRelation, + DeequColumnProfilerRelation, + DeequConstraintSuggestionRelation, + DeequVerificationRelation, + KLLParameters, + MetricEntity, + PredicateMessage, + VerificationStatus, +) + +__all__ = [ + "DeequVerificationRelation", + "DeequAnalysisRelation", + "DeequColumnProfilerRelation", + "DeequConstraintSuggestionRelation", + "CheckMessage", + "ConstraintMessage", + "PredicateMessage", + "AnalyzerMessage", + "KLLParameters", + "VerificationStatus", + "CheckStatus", + "ConstraintStatus", + "MetricEntity", +] diff --git a/pydeequ/v2/proto/deequ_connect.proto b/pydeequ/v2/proto/deequ_connect.proto new file mode 100644 index 0000000..e2fe1c1 --- /dev/null +++ b/pydeequ/v2/proto/deequ_connect.proto @@ -0,0 +1,236 @@ +/** + * Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +syntax = "proto3"; + +package com.amazon.deequ.connect; + +option java_multiple_files = true; +option java_package = "com.amazon.deequ.connect.proto"; +option java_outer_classname = "DeequConnectProtos"; + +// ============================================================================ +// Main Request Messages - Used as Spark Connect Relation Extensions +// ============================================================================ + +// Verification request - runs checks and returns results as DataFrame +message DeequVerificationRelation { + // Reference to the input DataFrame (serialized Spark Connect Relation) + bytes input_relation = 1; + + // Checks to run + repeated CheckMessage checks = 2; + + // Additional analyzers to run (beyond those required by checks) + repeated AnalyzerMessage required_analyzers = 3; +} + +// Analysis request - runs analyzers and returns metrics as DataFrame +message DeequAnalysisRelation { + // Reference to the input DataFrame + bytes input_relation = 1; + + // Analyzers to run + repeated AnalyzerMessage analyzers = 2; +} + +// ============================================================================ +// Check Messages +// ============================================================================ + +// Check definition - a named collection of constraints +message CheckMessage { + // Check severity level + enum Level { + ERROR = 0; + WARNING = 1; + } + + Level level = 1; + string description = 2; + repeated ConstraintMessage constraints = 3; +} + +// ============================================================================ +// Constraint Messages +// ============================================================================ + +// Constraint definition - a single data quality rule +message ConstraintMessage { + // Constraint type identifier + string type = 1; + + // Common fields + string column = 2; // Single column name + repeated string columns = 3; // Multiple column names + PredicateMessage assertion = 4; // Assertion predicate + string hint = 5; // Hint message for failures + string where = 6; // SQL WHERE clause filter + + // Type-specific fields + string pattern = 7; // Regex pattern (for hasPattern, containsEmail, etc.) 
+ string column_condition = 8; // SQL condition (for satisfies) + string constraint_name = 9; // Name for custom constraints (for satisfies) + repeated string allowed_values = 10; // Allowed values (for isContainedIn) + + // Numeric parameters + double quantile = 11; // For hasApproxQuantile +} + +// ============================================================================ +// Predicate Messages - Replaces Python Lambda Assertions +// ============================================================================ + +// Predicate for numeric assertions +message PredicateMessage { + enum Operator { + UNSPECIFIED = 0; // Default/unset - used to detect "no predicate" vs "EQ 0.0" + EQ = 1; // == + NE = 2; // != + GT = 3; // > + GE = 4; // >= + LT = 5; // < + LE = 6; // <= + BETWEEN = 7; // lower <= x <= upper + } + + Operator operator = 1; + double value = 2; // For comparison operators + double lower_bound = 3; // For BETWEEN + double upper_bound = 4; // For BETWEEN +} + +// ============================================================================ +// Analyzer Messages +// ============================================================================ + +// Analyzer definition - computes a metric on data +message AnalyzerMessage { + // Analyzer type identifier + string type = 1; + + // Common fields + string column = 2; // Single column name + repeated string columns = 3; // Multiple column names + string where = 4; // SQL WHERE clause filter + + // Type-specific parameters + double quantile = 5; // For ApproxQuantile + double relative_error = 6; // For ApproxQuantile, ApproxCountDistinct + string pattern = 7; // For PatternMatch + int32 max_detail_bins = 8; // For Histogram + + // KLL Sketch parameters + KLLParameters kll_parameters = 9; +} + +// Parameters for KLL Sketch analyzer +message KLLParameters { + int32 sketch_size = 1; + double shrinking_factor = 2; + int32 number_of_buckets = 3; +} + +// ============================================================================ +// Result Messages +// ============================================================================ + +// Verification result status +enum VerificationStatus { + VERIFICATION_SUCCESS = 0; + VERIFICATION_WARNING = 1; + VERIFICATION_ERROR = 2; +} + +// Check result status +enum CheckStatus { + CHECK_SUCCESS = 0; + CHECK_WARNING = 1; + CHECK_ERROR = 2; +} + +// Constraint result status +enum ConstraintStatus { + CONSTRAINT_SUCCESS = 0; + CONSTRAINT_FAILURE = 1; +} + +// Metric entity type +enum MetricEntity { + DATASET = 0; + COLUMN = 1; + MULTICOLUMN = 2; +} + +// ============================================================================ +// Column Profiler Messages +// ============================================================================ + +// Column profiler request - analyzes column distributions and statistics +message DeequColumnProfilerRelation { + // Reference to the input DataFrame (serialized Spark Connect Relation) + bytes input_relation = 1; + + // Restrict profiling to specific columns (empty = all columns) + repeated string restrict_to_columns = 2; + + // Threshold for computing histograms (columns with distinct values <= threshold get histograms) + int32 low_cardinality_histogram_threshold = 3; + + // Enable KLL sketch profiling for approximate quantiles + bool enable_kll_profiling = 4; + + // KLL sketch parameters (only used if enable_kll_profiling is true) + KLLParameters kll_parameters = 5; + + // Predefined data types for columns (column_name -> type_name) + // Supported types: "String", 
"Integer", "Long", "Double", "Boolean" + map predefined_types = 6; +} + +// ============================================================================ +// Constraint Suggestion Messages +// ============================================================================ + +// Constraint suggestion request - auto-generates data quality rules +message DeequConstraintSuggestionRelation { + // Reference to the input DataFrame (serialized Spark Connect Relation) + bytes input_relation = 1; + + // Constraint rule sets to apply + // Values: "DEFAULT", "STRING", "NUMERICAL", "COMMON", "EXTENDED" + repeated string constraint_rules = 2; + + // Restrict suggestions to specific columns (empty = all columns) + repeated string restrict_to_columns = 3; + + // Threshold for computing histograms + int32 low_cardinality_histogram_threshold = 4; + + // Enable KLL sketch profiling + bool enable_kll_profiling = 5; + + // KLL sketch parameters + KLLParameters kll_parameters = 6; + + // Predefined data types for columns + map predefined_types = 7; + + // Train/test split ratio (0.0 = disabled, 0.0-1.0 = ratio for test set) + double testset_ratio = 8; + + // Random seed for train/test split (0 = no seed) + int64 testset_split_random_seed = 9; +} diff --git a/pydeequ/v2/proto/deequ_connect_pb2.py b/pydeequ/v2/proto/deequ_connect_pb2.py new file mode 100644 index 0000000..61aadf3 --- /dev/null +++ b/pydeequ/v2/proto/deequ_connect_pb2.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE +# source: deequ_connect.proto +# Protobuf Python Version: 6.33.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 6, + 33, + 2, + '', + 'deequ_connect.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13\x64\x65\x65qu_connect.proto\x12\x18\x63om.amazon.deequ.connect\"\xb2\x01\n\x19\x44\x65\x65quVerificationRelation\x12\x16\n\x0einput_relation\x18\x01 \x01(\x0c\x12\x36\n\x06\x63hecks\x18\x02 \x03(\x0b\x32&.com.amazon.deequ.connect.CheckMessage\x12\x45\n\x12required_analyzers\x18\x03 \x03(\x0b\x32).com.amazon.deequ.connect.AnalyzerMessage\"m\n\x15\x44\x65\x65quAnalysisRelation\x12\x16\n\x0einput_relation\x18\x01 \x01(\x0c\x12<\n\tanalyzers\x18\x02 \x03(\x0b\x32).com.amazon.deequ.connect.AnalyzerMessage\"\xc3\x01\n\x0c\x43heckMessage\x12;\n\x05level\x18\x01 \x01(\x0e\x32,.com.amazon.deequ.connect.CheckMessage.Level\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12@\n\x0b\x63onstraints\x18\x03 \x03(\x0b\x32+.com.amazon.deequ.connect.ConstraintMessage\"\x1f\n\x05Level\x12\t\n\x05\x45RROR\x10\x00\x12\x0b\n\x07WARNING\x10\x01\"\x8c\x02\n\x11\x43onstraintMessage\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x0e\n\x06\x63olumn\x18\x02 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x03 \x03(\t\x12=\n\tassertion\x18\x04 \x01(\x0b\x32*.com.amazon.deequ.connect.PredicateMessage\x12\x0c\n\x04hint\x18\x05 \x01(\t\x12\r\n\x05where\x18\x06 \x01(\t\x12\x0f\n\x07pattern\x18\x07 \x01(\t\x12\x18\n\x10\x63olumn_condition\x18\x08 \x01(\t\x12\x17\n\x0f\x63onstraint_name\x18\t 
\x01(\t\x12\x16\n\x0e\x61llowed_values\x18\n \x03(\t\x12\x10\n\x08quantile\x18\x0b \x01(\x01\"\xec\x01\n\x10PredicateMessage\x12\x45\n\x08operator\x18\x01 \x01(\x0e\x32\x33.com.amazon.deequ.connect.PredicateMessage.Operator\x12\r\n\x05value\x18\x02 \x01(\x01\x12\x13\n\x0blower_bound\x18\x03 \x01(\x01\x12\x13\n\x0bupper_bound\x18\x04 \x01(\x01\"X\n\x08Operator\x12\x0f\n\x0bUNSPECIFIED\x10\x00\x12\x06\n\x02\x45Q\x10\x01\x12\x06\n\x02NE\x10\x02\x12\x06\n\x02GT\x10\x03\x12\x06\n\x02GE\x10\x04\x12\x06\n\x02LT\x10\x05\x12\x06\n\x02LE\x10\x06\x12\x0b\n\x07\x42\x45TWEEN\x10\x07\"\xe4\x01\n\x0f\x41nalyzerMessage\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\x0e\n\x06\x63olumn\x18\x02 \x01(\t\x12\x0f\n\x07\x63olumns\x18\x03 \x03(\t\x12\r\n\x05where\x18\x04 \x01(\t\x12\x10\n\x08quantile\x18\x05 \x01(\x01\x12\x16\n\x0erelative_error\x18\x06 \x01(\x01\x12\x0f\n\x07pattern\x18\x07 \x01(\t\x12\x17\n\x0fmax_detail_bins\x18\x08 \x01(\x05\x12?\n\x0ekll_parameters\x18\t \x01(\x0b\x32\'.com.amazon.deequ.connect.KLLParameters\"Y\n\rKLLParameters\x12\x13\n\x0bsketch_size\x18\x01 \x01(\x05\x12\x18\n\x10shrinking_factor\x18\x02 \x01(\x01\x12\x19\n\x11number_of_buckets\x18\x03 \x01(\x05\"\xfc\x02\n\x1b\x44\x65\x65quColumnProfilerRelation\x12\x16\n\x0einput_relation\x18\x01 \x01(\x0c\x12\x1b\n\x13restrict_to_columns\x18\x02 \x03(\t\x12+\n#low_cardinality_histogram_threshold\x18\x03 \x01(\x05\x12\x1c\n\x14\x65nable_kll_profiling\x18\x04 \x01(\x08\x12?\n\x0ekll_parameters\x18\x05 \x01(\x0b\x32\'.com.amazon.deequ.connect.KLLParameters\x12\x64\n\x10predefined_types\x18\x06 \x03(\x0b\x32J.com.amazon.deequ.connect.DeequColumnProfilerRelation.PredefinedTypesEntry\x1a\x36\n\x14PredefinedTypesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xdc\x03\n!DeequConstraintSuggestionRelation\x12\x16\n\x0einput_relation\x18\x01 \x01(\x0c\x12\x18\n\x10\x63onstraint_rules\x18\x02 \x03(\t\x12\x1b\n\x13restrict_to_columns\x18\x03 \x03(\t\x12+\n#low_cardinality_histogram_threshold\x18\x04 \x01(\x05\x12\x1c\n\x14\x65nable_kll_profiling\x18\x05 \x01(\x08\x12?\n\x0ekll_parameters\x18\x06 \x01(\x0b\x32\'.com.amazon.deequ.connect.KLLParameters\x12j\n\x10predefined_types\x18\x07 \x03(\x0b\x32P.com.amazon.deequ.connect.DeequConstraintSuggestionRelation.PredefinedTypesEntry\x12\x15\n\rtestset_ratio\x18\x08 \x01(\x01\x12!\n\x19testset_split_random_seed\x18\t \x01(\x03\x1a\x36\n\x14PredefinedTypesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01*`\n\x12VerificationStatus\x12\x18\n\x14VERIFICATION_SUCCESS\x10\x00\x12\x18\n\x14VERIFICATION_WARNING\x10\x01\x12\x16\n\x12VERIFICATION_ERROR\x10\x02*D\n\x0b\x43heckStatus\x12\x11\n\rCHECK_SUCCESS\x10\x00\x12\x11\n\rCHECK_WARNING\x10\x01\x12\x0f\n\x0b\x43HECK_ERROR\x10\x02*B\n\x10\x43onstraintStatus\x12\x16\n\x12\x43ONSTRAINT_SUCCESS\x10\x00\x12\x16\n\x12\x43ONSTRAINT_FAILURE\x10\x01*8\n\x0cMetricEntity\x12\x0b\n\x07\x44\x41TASET\x10\x00\x12\n\n\x06\x43OLUMN\x10\x01\x12\x0f\n\x0bMULTICOLUMN\x10\x02\x42\x36\n\x1e\x63om.amazon.deequ.connect.protoB\x12\x44\x65\x65quConnectProtosP\x01\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'deequ_connect_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals['DESCRIPTOR']._loaded_options = None + _globals['DESCRIPTOR']._serialized_options = b'\n\036com.amazon.deequ.connect.protoB\022DeequConnectProtosP\001' + _globals['_DEEQUCOLUMNPROFILERRELATION_PREDEFINEDTYPESENTRY']._loaded_options 
= None + _globals['_DEEQUCOLUMNPROFILERRELATION_PREDEFINEDTYPESENTRY']._serialized_options = b'8\001' + _globals['_DEEQUCONSTRAINTSUGGESTIONRELATION_PREDEFINEDTYPESENTRY']._loaded_options = None + _globals['_DEEQUCONSTRAINTSUGGESTIONRELATION_PREDEFINEDTYPESENTRY']._serialized_options = b'8\001' + _globals['_VERIFICATIONSTATUS']._serialized_start=2233 + _globals['_VERIFICATIONSTATUS']._serialized_end=2329 + _globals['_CHECKSTATUS']._serialized_start=2331 + _globals['_CHECKSTATUS']._serialized_end=2399 + _globals['_CONSTRAINTSTATUS']._serialized_start=2401 + _globals['_CONSTRAINTSTATUS']._serialized_end=2467 + _globals['_METRICENTITY']._serialized_start=2469 + _globals['_METRICENTITY']._serialized_end=2525 + _globals['_DEEQUVERIFICATIONRELATION']._serialized_start=50 + _globals['_DEEQUVERIFICATIONRELATION']._serialized_end=228 + _globals['_DEEQUANALYSISRELATION']._serialized_start=230 + _globals['_DEEQUANALYSISRELATION']._serialized_end=339 + _globals['_CHECKMESSAGE']._serialized_start=342 + _globals['_CHECKMESSAGE']._serialized_end=537 + _globals['_CHECKMESSAGE_LEVEL']._serialized_start=506 + _globals['_CHECKMESSAGE_LEVEL']._serialized_end=537 + _globals['_CONSTRAINTMESSAGE']._serialized_start=540 + _globals['_CONSTRAINTMESSAGE']._serialized_end=808 + _globals['_PREDICATEMESSAGE']._serialized_start=811 + _globals['_PREDICATEMESSAGE']._serialized_end=1047 + _globals['_PREDICATEMESSAGE_OPERATOR']._serialized_start=959 + _globals['_PREDICATEMESSAGE_OPERATOR']._serialized_end=1047 + _globals['_ANALYZERMESSAGE']._serialized_start=1050 + _globals['_ANALYZERMESSAGE']._serialized_end=1278 + _globals['_KLLPARAMETERS']._serialized_start=1280 + _globals['_KLLPARAMETERS']._serialized_end=1369 + _globals['_DEEQUCOLUMNPROFILERRELATION']._serialized_start=1372 + _globals['_DEEQUCOLUMNPROFILERRELATION']._serialized_end=1752 + _globals['_DEEQUCOLUMNPROFILERRELATION_PREDEFINEDTYPESENTRY']._serialized_start=1698 + _globals['_DEEQUCOLUMNPROFILERRELATION_PREDEFINEDTYPESENTRY']._serialized_end=1752 + _globals['_DEEQUCONSTRAINTSUGGESTIONRELATION']._serialized_start=1755 + _globals['_DEEQUCONSTRAINTSUGGESTIONRELATION']._serialized_end=2231 + _globals['_DEEQUCONSTRAINTSUGGESTIONRELATION_PREDEFINEDTYPESENTRY']._serialized_start=1698 + _globals['_DEEQUCONSTRAINTSUGGESTIONRELATION_PREDEFINEDTYPESENTRY']._serialized_end=1752 +# @@protoc_insertion_point(module_scope) diff --git a/pydeequ/v2/proto/deequ_connect_pb2.pyi b/pydeequ/v2/proto/deequ_connect_pb2.pyi new file mode 100644 index 0000000..b46b22f --- /dev/null +++ b/pydeequ/v2/proto/deequ_connect_pb2.pyi @@ -0,0 +1,216 @@ +from google.protobuf.internal import containers as _containers +from google.protobuf.internal import enum_type_wrapper as _enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from collections.abc import Iterable as _Iterable, Mapping as _Mapping +from typing import ClassVar as _ClassVar, Optional as _Optional, Union as _Union + +DESCRIPTOR: _descriptor.FileDescriptor + +class VerificationStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + VERIFICATION_SUCCESS: _ClassVar[VerificationStatus] + VERIFICATION_WARNING: _ClassVar[VerificationStatus] + VERIFICATION_ERROR: _ClassVar[VerificationStatus] + +class CheckStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + CHECK_SUCCESS: _ClassVar[CheckStatus] + CHECK_WARNING: _ClassVar[CheckStatus] + CHECK_ERROR: _ClassVar[CheckStatus] + +class ConstraintStatus(int, 
metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + CONSTRAINT_SUCCESS: _ClassVar[ConstraintStatus] + CONSTRAINT_FAILURE: _ClassVar[ConstraintStatus] + +class MetricEntity(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + DATASET: _ClassVar[MetricEntity] + COLUMN: _ClassVar[MetricEntity] + MULTICOLUMN: _ClassVar[MetricEntity] +VERIFICATION_SUCCESS: VerificationStatus +VERIFICATION_WARNING: VerificationStatus +VERIFICATION_ERROR: VerificationStatus +CHECK_SUCCESS: CheckStatus +CHECK_WARNING: CheckStatus +CHECK_ERROR: CheckStatus +CONSTRAINT_SUCCESS: ConstraintStatus +CONSTRAINT_FAILURE: ConstraintStatus +DATASET: MetricEntity +COLUMN: MetricEntity +MULTICOLUMN: MetricEntity + +class DeequVerificationRelation(_message.Message): + __slots__ = () + INPUT_RELATION_FIELD_NUMBER: _ClassVar[int] + CHECKS_FIELD_NUMBER: _ClassVar[int] + REQUIRED_ANALYZERS_FIELD_NUMBER: _ClassVar[int] + input_relation: bytes + checks: _containers.RepeatedCompositeFieldContainer[CheckMessage] + required_analyzers: _containers.RepeatedCompositeFieldContainer[AnalyzerMessage] + def __init__(self, input_relation: _Optional[bytes] = ..., checks: _Optional[_Iterable[_Union[CheckMessage, _Mapping]]] = ..., required_analyzers: _Optional[_Iterable[_Union[AnalyzerMessage, _Mapping]]] = ...) -> None: ... + +class DeequAnalysisRelation(_message.Message): + __slots__ = () + INPUT_RELATION_FIELD_NUMBER: _ClassVar[int] + ANALYZERS_FIELD_NUMBER: _ClassVar[int] + input_relation: bytes + analyzers: _containers.RepeatedCompositeFieldContainer[AnalyzerMessage] + def __init__(self, input_relation: _Optional[bytes] = ..., analyzers: _Optional[_Iterable[_Union[AnalyzerMessage, _Mapping]]] = ...) -> None: ... + +class CheckMessage(_message.Message): + __slots__ = () + class Level(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + ERROR: _ClassVar[CheckMessage.Level] + WARNING: _ClassVar[CheckMessage.Level] + ERROR: CheckMessage.Level + WARNING: CheckMessage.Level + LEVEL_FIELD_NUMBER: _ClassVar[int] + DESCRIPTION_FIELD_NUMBER: _ClassVar[int] + CONSTRAINTS_FIELD_NUMBER: _ClassVar[int] + level: CheckMessage.Level + description: str + constraints: _containers.RepeatedCompositeFieldContainer[ConstraintMessage] + def __init__(self, level: _Optional[_Union[CheckMessage.Level, str]] = ..., description: _Optional[str] = ..., constraints: _Optional[_Iterable[_Union[ConstraintMessage, _Mapping]]] = ...) -> None: ... 
+ +class ConstraintMessage(_message.Message): + __slots__ = () + TYPE_FIELD_NUMBER: _ClassVar[int] + COLUMN_FIELD_NUMBER: _ClassVar[int] + COLUMNS_FIELD_NUMBER: _ClassVar[int] + ASSERTION_FIELD_NUMBER: _ClassVar[int] + HINT_FIELD_NUMBER: _ClassVar[int] + WHERE_FIELD_NUMBER: _ClassVar[int] + PATTERN_FIELD_NUMBER: _ClassVar[int] + COLUMN_CONDITION_FIELD_NUMBER: _ClassVar[int] + CONSTRAINT_NAME_FIELD_NUMBER: _ClassVar[int] + ALLOWED_VALUES_FIELD_NUMBER: _ClassVar[int] + QUANTILE_FIELD_NUMBER: _ClassVar[int] + type: str + column: str + columns: _containers.RepeatedScalarFieldContainer[str] + assertion: PredicateMessage + hint: str + where: str + pattern: str + column_condition: str + constraint_name: str + allowed_values: _containers.RepeatedScalarFieldContainer[str] + quantile: float + def __init__(self, type: _Optional[str] = ..., column: _Optional[str] = ..., columns: _Optional[_Iterable[str]] = ..., assertion: _Optional[_Union[PredicateMessage, _Mapping]] = ..., hint: _Optional[str] = ..., where: _Optional[str] = ..., pattern: _Optional[str] = ..., column_condition: _Optional[str] = ..., constraint_name: _Optional[str] = ..., allowed_values: _Optional[_Iterable[str]] = ..., quantile: _Optional[float] = ...) -> None: ... + +class PredicateMessage(_message.Message): + __slots__ = () + class Operator(int, metaclass=_enum_type_wrapper.EnumTypeWrapper): + __slots__ = () + UNSPECIFIED: _ClassVar[PredicateMessage.Operator] + EQ: _ClassVar[PredicateMessage.Operator] + NE: _ClassVar[PredicateMessage.Operator] + GT: _ClassVar[PredicateMessage.Operator] + GE: _ClassVar[PredicateMessage.Operator] + LT: _ClassVar[PredicateMessage.Operator] + LE: _ClassVar[PredicateMessage.Operator] + BETWEEN: _ClassVar[PredicateMessage.Operator] + UNSPECIFIED: PredicateMessage.Operator + EQ: PredicateMessage.Operator + NE: PredicateMessage.Operator + GT: PredicateMessage.Operator + GE: PredicateMessage.Operator + LT: PredicateMessage.Operator + LE: PredicateMessage.Operator + BETWEEN: PredicateMessage.Operator + OPERATOR_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + LOWER_BOUND_FIELD_NUMBER: _ClassVar[int] + UPPER_BOUND_FIELD_NUMBER: _ClassVar[int] + operator: PredicateMessage.Operator + value: float + lower_bound: float + upper_bound: float + def __init__(self, operator: _Optional[_Union[PredicateMessage.Operator, str]] = ..., value: _Optional[float] = ..., lower_bound: _Optional[float] = ..., upper_bound: _Optional[float] = ...) -> None: ... + +class AnalyzerMessage(_message.Message): + __slots__ = () + TYPE_FIELD_NUMBER: _ClassVar[int] + COLUMN_FIELD_NUMBER: _ClassVar[int] + COLUMNS_FIELD_NUMBER: _ClassVar[int] + WHERE_FIELD_NUMBER: _ClassVar[int] + QUANTILE_FIELD_NUMBER: _ClassVar[int] + RELATIVE_ERROR_FIELD_NUMBER: _ClassVar[int] + PATTERN_FIELD_NUMBER: _ClassVar[int] + MAX_DETAIL_BINS_FIELD_NUMBER: _ClassVar[int] + KLL_PARAMETERS_FIELD_NUMBER: _ClassVar[int] + type: str + column: str + columns: _containers.RepeatedScalarFieldContainer[str] + where: str + quantile: float + relative_error: float + pattern: str + max_detail_bins: int + kll_parameters: KLLParameters + def __init__(self, type: _Optional[str] = ..., column: _Optional[str] = ..., columns: _Optional[_Iterable[str]] = ..., where: _Optional[str] = ..., quantile: _Optional[float] = ..., relative_error: _Optional[float] = ..., pattern: _Optional[str] = ..., max_detail_bins: _Optional[int] = ..., kll_parameters: _Optional[_Union[KLLParameters, _Mapping]] = ...) -> None: ... 
+ +class KLLParameters(_message.Message): + __slots__ = () + SKETCH_SIZE_FIELD_NUMBER: _ClassVar[int] + SHRINKING_FACTOR_FIELD_NUMBER: _ClassVar[int] + NUMBER_OF_BUCKETS_FIELD_NUMBER: _ClassVar[int] + sketch_size: int + shrinking_factor: float + number_of_buckets: int + def __init__(self, sketch_size: _Optional[int] = ..., shrinking_factor: _Optional[float] = ..., number_of_buckets: _Optional[int] = ...) -> None: ... + +class DeequColumnProfilerRelation(_message.Message): + __slots__ = () + class PredefinedTypesEntry(_message.Message): + __slots__ = () + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: str + def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ... + INPUT_RELATION_FIELD_NUMBER: _ClassVar[int] + RESTRICT_TO_COLUMNS_FIELD_NUMBER: _ClassVar[int] + LOW_CARDINALITY_HISTOGRAM_THRESHOLD_FIELD_NUMBER: _ClassVar[int] + ENABLE_KLL_PROFILING_FIELD_NUMBER: _ClassVar[int] + KLL_PARAMETERS_FIELD_NUMBER: _ClassVar[int] + PREDEFINED_TYPES_FIELD_NUMBER: _ClassVar[int] + input_relation: bytes + restrict_to_columns: _containers.RepeatedScalarFieldContainer[str] + low_cardinality_histogram_threshold: int + enable_kll_profiling: bool + kll_parameters: KLLParameters + predefined_types: _containers.ScalarMap[str, str] + def __init__(self, input_relation: _Optional[bytes] = ..., restrict_to_columns: _Optional[_Iterable[str]] = ..., low_cardinality_histogram_threshold: _Optional[int] = ..., enable_kll_profiling: _Optional[bool] = ..., kll_parameters: _Optional[_Union[KLLParameters, _Mapping]] = ..., predefined_types: _Optional[_Mapping[str, str]] = ...) -> None: ... + +class DeequConstraintSuggestionRelation(_message.Message): + __slots__ = () + class PredefinedTypesEntry(_message.Message): + __slots__ = () + KEY_FIELD_NUMBER: _ClassVar[int] + VALUE_FIELD_NUMBER: _ClassVar[int] + key: str + value: str + def __init__(self, key: _Optional[str] = ..., value: _Optional[str] = ...) -> None: ... + INPUT_RELATION_FIELD_NUMBER: _ClassVar[int] + CONSTRAINT_RULES_FIELD_NUMBER: _ClassVar[int] + RESTRICT_TO_COLUMNS_FIELD_NUMBER: _ClassVar[int] + LOW_CARDINALITY_HISTOGRAM_THRESHOLD_FIELD_NUMBER: _ClassVar[int] + ENABLE_KLL_PROFILING_FIELD_NUMBER: _ClassVar[int] + KLL_PARAMETERS_FIELD_NUMBER: _ClassVar[int] + PREDEFINED_TYPES_FIELD_NUMBER: _ClassVar[int] + TESTSET_RATIO_FIELD_NUMBER: _ClassVar[int] + TESTSET_SPLIT_RANDOM_SEED_FIELD_NUMBER: _ClassVar[int] + input_relation: bytes + constraint_rules: _containers.RepeatedScalarFieldContainer[str] + restrict_to_columns: _containers.RepeatedScalarFieldContainer[str] + low_cardinality_histogram_threshold: int + enable_kll_profiling: bool + kll_parameters: KLLParameters + predefined_types: _containers.ScalarMap[str, str] + testset_ratio: float + testset_split_random_seed: int + def __init__(self, input_relation: _Optional[bytes] = ..., constraint_rules: _Optional[_Iterable[str]] = ..., restrict_to_columns: _Optional[_Iterable[str]] = ..., low_cardinality_histogram_threshold: _Optional[int] = ..., enable_kll_profiling: _Optional[bool] = ..., kll_parameters: _Optional[_Union[KLLParameters, _Mapping]] = ..., predefined_types: _Optional[_Mapping[str, str]] = ..., testset_ratio: _Optional[float] = ..., testset_split_random_seed: _Optional[int] = ...) -> None: ... 
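For reference, here is a minimal sketch of how the generated messages above combine with the helpers in `pydeequ/v2/spark_helpers.py` (next file); it mirrors the `_run_via_spark_connect` methods in `profiles.py` and `suggestions.py`. The function `run_analyzers_sketch` and its parameters are illustrative only and are not part of this diff.

```python
# Sketch of the client-side round trip used by the v2 runners: embed the input
# DataFrame's plan, pack the Deequ request into a google.protobuf.Any, and send
# it to the server as a Spark Connect relation extension.
from google.protobuf import any_pb2

from pydeequ.v2.proto import deequ_connect_pb2 as proto
from pydeequ.v2.spark_helpers import create_deequ_plan, dataframe_from_plan


def run_analyzers_sketch(spark, df, analyzer_messages):
    """Hypothetical helper: run pre-built AnalyzerMessage protos on df."""
    msg = proto.DeequAnalysisRelation()

    # Serialize the input DataFrame's Spark Connect plan (the same private-API
    # access that profiles.py and suggestions.py use).
    input_plan = df._plan.to_proto(spark._client)
    msg.input_relation = input_plan.root.SerializeToString()

    # Attach the analyzers to run.
    msg.analyzers.extend(analyzer_messages)

    # Wrap the request in an Any and turn it into a plan/DataFrame; the
    # server-side DeequRelationPlugin unpacks the extension and executes it.
    extension = any_pb2.Any()
    extension.Pack(msg, type_url_prefix="type.googleapis.com")
    plan = create_deequ_plan(extension)
    return dataframe_from_plan(plan, spark)
```

The same Any/extension round trip is what `create_deequ_plan` expects, which is why the profiling, suggestion, and verification runners can all share these two helpers.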
diff --git a/pydeequ/v2/spark_helpers.py b/pydeequ/v2/spark_helpers.py new file mode 100644 index 0000000..56c72e2 --- /dev/null +++ b/pydeequ/v2/spark_helpers.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +""" +Spark helper functions for PyDeequ v2. + +This module provides helper functions for working with Spark Connect, +including compatibility shims for different Spark versions. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from google.protobuf import any_pb2 + +if TYPE_CHECKING: + from pyspark.sql import DataFrame, SparkSession + from pyspark.sql.connect.plan import LogicalPlan + + +def dataframe_from_plan(plan: "LogicalPlan", session: "SparkSession") -> "DataFrame": + """ + Create a DataFrame from a LogicalPlan, handling Spark version differences. + + Spark 3.x uses DataFrame.withPlan(plan, session) + Spark 4.x uses DataFrame(plan, session) + + Args: + plan: LogicalPlan to create DataFrame from + session: SparkSession + + Returns: + DataFrame wrapping the plan + """ + from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame + + if hasattr(ConnectDataFrame, "withPlan"): + # Spark 3.x + return ConnectDataFrame.withPlan(plan, session=session) + + # Spark 4.x + return ConnectDataFrame(plan, session) + + +def create_deequ_plan(extension: any_pb2.Any) -> "LogicalPlan": + """ + Create a LogicalPlan subclass for Deequ that properly integrates with PySpark. + + We dynamically import and subclass LogicalPlan to avoid import issues + when Spark Connect is not available. + + Args: + extension: Protobuf Any message containing the Deequ operation + + Returns: + LogicalPlan instance for the Deequ operation + """ + import pyspark.sql.connect.proto as spark_proto + from pyspark.sql.connect.plan import LogicalPlan + + class _DeequExtensionPlan(LogicalPlan): + """ + Custom LogicalPlan for Deequ operations via Spark Connect. + + This plan wraps our protobuf message as a Relation extension, + which is sent to the server and handled by DeequRelationPlugin. + """ + + def __init__(self, ext: any_pb2.Any): + # Pass None as child - this is a leaf node + super().__init__(child=None) + self._extension = ext + + def plan(self, session) -> spark_proto.Relation: + """Return the Relation proto for this plan.""" + rel = self._create_proto_relation() + rel.extension.CopyFrom(self._extension) + return rel + + def __repr__(self) -> str: + return "DeequExtensionPlan" + + return _DeequExtensionPlan(extension) + + +__all__ = [ + "dataframe_from_plan", + "create_deequ_plan", +] diff --git a/pydeequ/v2/suggestions.py b/pydeequ/v2/suggestions.py new file mode 100644 index 0000000..b89b07b --- /dev/null +++ b/pydeequ/v2/suggestions.py @@ -0,0 +1,340 @@ +# -*- coding: utf-8 -*- +""" +Constraint Suggestions for Deequ Spark Connect. + +This module provides automatic constraint suggestion capabilities that analyze +DataFrame columns and suggest appropriate data quality constraints based on +the data characteristics. 
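+Suggestion requests are carried in a DeequConstraintSuggestionRelation protobuf
+message; the suggested constraints come back as a regular DataFrame.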
+ +Example usage: + from pyspark.sql import SparkSession + from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + + spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + + # Basic suggestions with default rules + suggestions = (ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .run()) + + # With train/test evaluation + suggestions = (ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.EXTENDED) + .useTrainTestSplitWithTestsetRatio(0.2, seed=42) + .run()) + + suggestions.show() # Result is a DataFrame with suggested constraints +""" + +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, Dict, List, Optional, Sequence + +from google.protobuf import any_pb2 + +from pydeequ.v2.profiles import KLLParameters +from pydeequ.v2.proto import deequ_connect_pb2 as proto +from pydeequ.v2.spark_helpers import create_deequ_plan, dataframe_from_plan + +if TYPE_CHECKING: + from pyspark.sql import DataFrame, SparkSession + + +class Rules(Enum): + """ + Constraint suggestion rule sets. + + Different rule sets analyze different aspects of the data: + + - DEFAULT: Core rules for completeness, type retention, categorical ranges + - STRING: String-specific rules for length constraints + - NUMERICAL: Numeric rules for min/max/mean/stddev + - COMMON: Common patterns like uniqueness + - EXTENDED: All rules combined + """ + + DEFAULT = "DEFAULT" + """Core rules: CompleteIfComplete, RetainCompleteness, RetainType, + CategoricalRange, FractionalCategoricalRange, NonNegativeNumbers""" + + STRING = "STRING" + """String rules: HasMinLength, HasMaxLength""" + + NUMERICAL = "NUMERICAL" + """Numeric rules: HasMin, HasMax, HasMean, HasStandardDeviation""" + + COMMON = "COMMON" + """Common patterns: UniqueIfApproximatelyUnique""" + + EXTENDED = "EXTENDED" + """All rules combined: DEFAULT + STRING + NUMERICAL + COMMON""" + + +class ConstraintSuggestionRunner: + """ + Entry point for generating constraint suggestions. + + ConstraintSuggestionRunner analyzes DataFrame columns to suggest + appropriate data quality constraints based on the data characteristics. + + Example: + suggestions = (ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .run()) + """ + + def __init__(self, spark: "SparkSession"): + """ + Create a new ConstraintSuggestionRunner. + + Args: + spark: SparkSession (can be either local or Spark Connect) + """ + self._spark = spark + + def onData(self, df: "DataFrame") -> "ConstraintSuggestionRunBuilder": + """ + Specify the DataFrame to analyze. + + Args: + df: DataFrame to analyze for constraint suggestions + + Returns: + ConstraintSuggestionRunBuilder for method chaining + """ + return ConstraintSuggestionRunBuilder(self._spark, df) + + +class ConstraintSuggestionRunBuilder: + """ + Builder for configuring and executing a constraint suggestion run. + + This class collects suggestion options and executes the analysis + when run() is called. + """ + + def __init__(self, spark: "SparkSession", df: "DataFrame"): + """ + Create a new ConstraintSuggestionRunBuilder. 
+ + Args: + spark: SparkSession + df: DataFrame to analyze + """ + self._spark = spark + self._df = df + self._rules: List[Rules] = [] + self._restrict_to_columns: Optional[Sequence[str]] = None + self._low_cardinality_threshold: int = 0 + self._enable_kll: bool = False + self._kll_parameters: Optional[KLLParameters] = None + self._predefined_types: Optional[Dict[str, str]] = None + self._testset_ratio: float = 0.0 + self._testset_seed: Optional[int] = None + + def addConstraintRules(self, rules: Rules) -> "ConstraintSuggestionRunBuilder": + """ + Add a constraint rule set. + + Can be called multiple times to add multiple rule sets. + + Args: + rules: Rules enum value specifying which rules to use + + Returns: + self for method chaining + """ + self._rules.append(rules) + return self + + def restrictToColumns( + self, columns: Sequence[str] + ) -> "ConstraintSuggestionRunBuilder": + """ + Restrict suggestions to specific columns. + + Args: + columns: List of column names to analyze + + Returns: + self for method chaining + """ + self._restrict_to_columns = columns + return self + + def withLowCardinalityHistogramThreshold( + self, threshold: int + ) -> "ConstraintSuggestionRunBuilder": + """ + Set threshold for computing histograms during profiling. + + Args: + threshold: Maximum distinct values for histogram computation + + Returns: + self for method chaining + """ + self._low_cardinality_threshold = threshold + return self + + def withKLLProfiling(self) -> "ConstraintSuggestionRunBuilder": + """ + Enable KLL sketch profiling for numeric columns. + + Returns: + self for method chaining + """ + self._enable_kll = True + return self + + def setKLLParameters( + self, params: KLLParameters + ) -> "ConstraintSuggestionRunBuilder": + """ + Set KLL sketch parameters. + + Args: + params: KLLParameters configuration + + Returns: + self for method chaining + """ + self._kll_parameters = params + return self + + def setPredefinedTypes( + self, types: Dict[str, str] + ) -> "ConstraintSuggestionRunBuilder": + """ + Set predefined data types for columns. + + Args: + types: Dictionary mapping column names to type names + + Returns: + self for method chaining + """ + self._predefined_types = types + return self + + def useTrainTestSplitWithTestsetRatio( + self, ratio: float, seed: Optional[int] = None + ) -> "ConstraintSuggestionRunBuilder": + """ + Enable train/test split for evaluating suggestions. + + When enabled, the data is split into training and test sets. + Suggestions are generated from the training set and then + evaluated against the test set. + + Args: + ratio: Fraction of data to use as test set (0.0-1.0) + seed: Optional random seed for reproducibility + + Returns: + self for method chaining + """ + if not 0.0 < ratio < 1.0: + raise ValueError("testset_ratio must be between 0.0 and 1.0 (exclusive)") + self._testset_ratio = ratio + self._testset_seed = seed + return self + + def run(self) -> "DataFrame": + """ + Execute the suggestion analysis and return results as a DataFrame. 
+ + The result DataFrame contains columns: + - column_name: Column the constraint applies to + - constraint_name: Type of constraint (e.g., "Completeness", "IsIn") + - current_value: Current metric value that triggered suggestion + - description: Human-readable description + - suggesting_rule: Rule that generated this suggestion + - code_for_constraint: Python code snippet for the constraint + + If train/test split is enabled: + - evaluation_status: "Success" or "Failure" on test set + - evaluation_metric_value: Actual metric on test set + + Returns: + DataFrame with constraint suggestions + + Raises: + RuntimeError: If the Deequ plugin is not available on the server + ValueError: If no rules have been added + """ + if not self._rules: + raise ValueError( + "At least one constraint rule set must be added. " + "Use .addConstraintRules(Rules.DEFAULT) to add rules." + ) + + # Build the protobuf message + suggestion_msg = self._build_suggestion_message() + + # V2 only supports Spark Connect + return self._run_via_spark_connect(suggestion_msg) + + def _build_suggestion_message(self) -> proto.DeequConstraintSuggestionRelation: + """Build the protobuf suggestion message.""" + msg = proto.DeequConstraintSuggestionRelation() + + # Add constraint rules + for rule in self._rules: + msg.constraint_rules.append(rule.value) + + # Set column restrictions + if self._restrict_to_columns: + msg.restrict_to_columns.extend(self._restrict_to_columns) + + # Set histogram threshold + if self._low_cardinality_threshold > 0: + msg.low_cardinality_histogram_threshold = self._low_cardinality_threshold + + # Set KLL profiling + msg.enable_kll_profiling = self._enable_kll + if self._kll_parameters: + msg.kll_parameters.CopyFrom(self._kll_parameters.to_proto()) + + # Set predefined types + if self._predefined_types: + for col, dtype in self._predefined_types.items(): + msg.predefined_types[col] = dtype + + # Set train/test split + if self._testset_ratio > 0: + msg.testset_ratio = self._testset_ratio + if self._testset_seed is not None: + msg.testset_split_random_seed = self._testset_seed + + return msg + + def _run_via_spark_connect( + self, msg: proto.DeequConstraintSuggestionRelation + ) -> "DataFrame": + """Execute suggestion analysis via Spark Connect plugin.""" + # Get the input DataFrame's plan as serialized bytes + input_plan = self._df._plan.to_proto(self._spark._client) + msg.input_relation = input_plan.root.SerializeToString() + + # Wrap our Deequ message in a google.protobuf.Any + extension = any_pb2.Any() + extension.Pack(msg, type_url_prefix="type.googleapis.com") + + # Create a proper LogicalPlan subclass with the extension + plan = create_deequ_plan(extension) + + # Create DataFrame from the plan (handles Spark 3.x vs 4.x) + return dataframe_from_plan(plan, self._spark) + + +# Export all public symbols +__all__ = [ + "ConstraintSuggestionRunner", + "ConstraintSuggestionRunBuilder", + "Rules", +] diff --git a/pydeequ/v2/verification.py b/pydeequ/v2/verification.py new file mode 100644 index 0000000..c6d8d2f --- /dev/null +++ b/pydeequ/v2/verification.py @@ -0,0 +1,279 @@ +# -*- coding: utf-8 -*- +""" +VerificationSuite for Deequ Spark Connect. + +This module provides the main entry point for running data quality checks +via Spark Connect. It builds protobuf messages and sends them to the +server-side Deequ plugin. 
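+
+The server must have the Deequ Connect plugin registered, for example via the
+same flag used by the test suite (see tests/v2/conftest.py):
+
+    --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin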
+ +Example usage: + from pyspark.sql import SparkSession + from pydeequ.v2.verification import VerificationSuite + from pydeequ.v2.checks import Check, CheckLevel + from pydeequ.v2.predicates import gte, eq + + spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + + check = (Check(CheckLevel.Error, "Data quality check") + .isComplete("id") + .hasCompleteness("email", gte(0.95))) + + result = (VerificationSuite(spark) + .onData(df) + .addCheck(check) + .run()) + + result.show() # Result is a DataFrame +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List + +from google.protobuf import any_pb2 + +from pydeequ.v2.analyzers import _ConnectAnalyzer +from pydeequ.v2.checks import Check +from pydeequ.v2.proto import deequ_connect_pb2 as proto +from pydeequ.v2.spark_helpers import create_deequ_plan, dataframe_from_plan + +if TYPE_CHECKING: + from pyspark.sql import DataFrame, SparkSession + + +class VerificationSuite: + """ + Main entry point for running data quality verification. + + VerificationSuite allows you to define checks and analyzers to run + on a DataFrame. When run() is called, the checks and analyzers are + serialized to protobuf and sent to the Spark Connect server where + the Deequ plugin executes them. + + Example: + suite = VerificationSuite(spark) + result = (suite + .onData(df) + .addCheck(check) + .run()) + """ + + def __init__(self, spark: "SparkSession"): + """ + Create a new VerificationSuite. + + Args: + spark: SparkSession connected via Spark Connect + """ + self._spark = spark + + def onData(self, df: "DataFrame") -> "VerificationRunBuilder": + """ + Specify the DataFrame to run verification on. + + Args: + df: DataFrame to verify + + Returns: + VerificationRunBuilder for method chaining + """ + return VerificationRunBuilder(self._spark, df) + + +class VerificationRunBuilder: + """ + Builder for configuring and executing a verification run. + + This class collects checks and analyzers, then executes them + when run() is called. + """ + + def __init__(self, spark: "SparkSession", df: "DataFrame"): + """ + Create a new VerificationRunBuilder. + + Args: + spark: SparkSession + df: DataFrame to verify + """ + self._spark = spark + self._df = df + self._checks: List[Check] = [] + self._analyzers: List[_ConnectAnalyzer] = [] + + def addCheck(self, check: Check) -> "VerificationRunBuilder": + """ + Add a check to run. + + Args: + check: Check to add + + Returns: + self for method chaining + """ + self._checks.append(check) + return self + + def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "VerificationRunBuilder": + """ + Add an analyzer to run (in addition to those required by checks). + + Args: + analyzer: Analyzer to add + + Returns: + self for method chaining + """ + self._analyzers.append(analyzer) + return self + + def run(self) -> "DataFrame": + """ + Execute the verification and return results as a DataFrame. 
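+
+        For example, failed constraints can be pulled out of the result with a
+        plain DataFrame filter (sketch; assumes df and check are already
+        defined):
+
+            result = (VerificationSuite(spark)
+                      .onData(df)
+                      .addCheck(check)
+                      .run())
+            result.filter("constraint_status = 'Failure'").show(truncate=False)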
+ + The result DataFrame contains columns: + - check: Check description + - check_level: Error or Warning + - check_status: Success, Warning, or Error + - constraint: Constraint description + - constraint_status: Success or Failure + - constraint_message: Details about failures + + Returns: + DataFrame with verification results + + Raises: + RuntimeError: If the Deequ plugin is not available on the server + """ + # Build the protobuf message + msg = proto.DeequVerificationRelation() + + # Add checks + for check in self._checks: + msg.checks.append(check.to_proto()) + + # Add required analyzers + for analyzer in self._analyzers: + msg.required_analyzers.append(analyzer.to_proto()) + + # Get the input DataFrame's plan as serialized bytes + # We serialize just the Relation (plan.root), not the full Plan, + # because Scala expects to parse it as a Relation + input_plan = self._df._plan.to_proto(self._spark._client) + msg.input_relation = input_plan.root.SerializeToString() + + # Wrap our Deequ message in a google.protobuf.Any + extension = any_pb2.Any() + extension.Pack(msg, type_url_prefix="type.googleapis.com") + + # Create a proper LogicalPlan subclass with the extension + plan = create_deequ_plan(extension) + + # Create DataFrame from the plan (handles Spark 3.x vs 4.x) + return dataframe_from_plan(plan, self._spark) + + +class AnalysisRunner: + """ + Entry point for running analyzers without checks. + + Use this when you want to compute metrics without defining + pass/fail constraints. + + Example: + from pydeequ.v2.analyzers import Size, Completeness, Mean + + result = (AnalysisRunner(spark) + .onData(df) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("email")) + .addAnalyzer(Mean("amount")) + .run()) + """ + + def __init__(self, spark: "SparkSession"): + """ + Create a new AnalysisRunner. + + Args: + spark: SparkSession connected via Spark Connect + """ + self._spark = spark + + def onData(self, df: "DataFrame") -> "AnalysisRunBuilder": + """ + Specify the DataFrame to analyze. + + Args: + df: DataFrame to analyze + + Returns: + AnalysisRunBuilder for method chaining + """ + return AnalysisRunBuilder(self._spark, df) + + +class AnalysisRunBuilder: + """Builder for configuring and executing an analysis run.""" + + def __init__(self, spark: "SparkSession", df: "DataFrame"): + """ + Create a new AnalysisRunBuilder. + + Args: + spark: SparkSession + df: DataFrame to analyze + """ + self._spark = spark + self._df = df + self._analyzers: List[_ConnectAnalyzer] = [] + + def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "AnalysisRunBuilder": + """ + Add an analyzer to run. + + Args: + analyzer: Analyzer to add + + Returns: + self for method chaining + """ + self._analyzers.append(analyzer) + return self + + def run(self) -> "DataFrame": + """ + Execute the analysis and return metrics as DataFrame. 
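+
+        The metrics come back as ordinary rows, so they can be collected and
+        indexed by analyzer name (sketch; assumes df is already defined):
+
+            metrics = (AnalysisRunner(spark)
+                       .onData(df)
+                       .addAnalyzer(Size())
+                       .addAnalyzer(Completeness("email"))
+                       .run())
+            values = {r["name"]: r["value"] for r in metrics.collect()}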
+ + Returns: + DataFrame with computed metrics + """ + # Build protobuf message + msg = proto.DeequAnalysisRelation() + for analyzer in self._analyzers: + msg.analyzers.append(analyzer.to_proto()) + + # Get the input DataFrame's plan as serialized bytes + # We serialize just the Relation (plan.root), not the full Plan, + # because Scala expects to parse it as a Relation + input_plan = self._df._plan.to_proto(self._spark._client) + msg.input_relation = input_plan.root.SerializeToString() + + # Wrap our Deequ message in a google.protobuf.Any + extension = any_pb2.Any() + extension.Pack(msg, type_url_prefix="type.googleapis.com") + + # Create a proper LogicalPlan subclass with the extension + plan = create_deequ_plan(extension) + + # Create DataFrame from the plan (handles Spark 3.x vs 4.x) + return dataframe_from_plan(plan, self._spark) + + +# Export all public symbols +__all__ = [ + "VerificationSuite", + "VerificationRunBuilder", + "AnalysisRunner", + "AnalysisRunBuilder", +] diff --git a/pyproject.toml b/pyproject.toml index 0a2fafa..8168444 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pydeequ" -version = "1.5.0" +version = "2.0.0b1" description = "PyDeequ - Unit Tests for Data" authors = ["Chenyang Liu ", "Rahul Sharma "] maintainers = ["Chenyang Liu ","Rahul Sharma "] @@ -28,27 +28,22 @@ classifiers = [ [tool.poetry.dependencies] -python = ">=3.8,<4" -numpy = ">=1.14.1" -pandas = ">=0.23.0" -pyspark = { version = ">=2.4.7, <3.3.0", optional = true } +python = ">=3.9,<4" +numpy = ">=1.23.0" +pandas = ">=1.5.0" +protobuf = ">=4.21.0" +setuptools = ">=69.0.0" # Required for Python 3.12+ (distutils removed) +pyspark = {version = "3.5.0", extras = ["connect"]} -[tool.poetry.dev-dependencies] -pytest = "^6.2.4" -pytest-cov = "^2.11.1" -coverage = "^5.5" -pytest-runner = "^5.3.0" -black = "^21.5b1" -flake8 = "^3.9.2" -flake8-docstrings = "^1.6.0" -pytest-flake8 = "^1.0.7" -pre-commit = "^2.12.1" -pytest-rerunfailures = "^9.1.1" -twine = "^3.4.1" -safety = "^1.10.3" +[tool.poetry.group.dev.dependencies] +pytest = "^8.0.0" +pytest-cov = "^4.1.0" +coverage = "^7.4.0" +black = "^24.0.0" +pre-commit = "^3.6.0" +pytest-rerunfailures = "^14.0" [tool.poetry.extras] -pyspark = ["pyspark"] [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/tests/conftest.py b/tests/conftest.py index 34926a4..543a27e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,22 +1,80 @@ # -*- coding: utf-8 -*- -# pylint: disable=redefined-outer-name -import logging +""" +Pytest configuration for PyDeequ tests using Spark Connect. -from pydeequ import deequ_maven_coord, f2j_maven_coord +All tests use the Spark Connect server which must be running before tests. +Start it with: scripts/start-spark-connect.sh +""" +import os +import pytest +from pyspark.sql import SparkSession -# @pytest.yield_fixture(autouse=True) + +# Set environment variables required for pydeequ +os.environ.setdefault("SPARK_VERSION", "3.5") + + +def create_spark_connect_session() -> SparkSession: + """ + Create a Spark Connect session for testing. + + Requires Spark Connect server to be running on localhost:15002. + Start the server with the Deequ plugin loaded. + + Returns: + SparkSession connected to Spark Connect server + """ + return SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + + +@pytest.fixture(scope="module") +def spark() -> SparkSession: + """ + Pytest fixture providing a Spark Connect session. + + The session is shared within each test module for efficiency. 
+
+    Yields:
+        SparkSession for testing
+    """
+    session = create_spark_connect_session()
+    yield session
+    session.stop()
+
+
+# Alias for backward compatibility with existing tests
+spark_session = spark
+
+
+# Legacy function for unittest-based tests
 def setup_pyspark():
-    from pyspark.sql import SparkSession
-
-    return (
-        SparkSession.builder.master("local[*]")
-        .config("spark.executor.memory", "2g")
-        .config("spark.jars.packages", deequ_maven_coord)
-        .config("spark.pyspark.python", "/usr/bin/python3")
-        .config("spark.pyspark.driver.python", "/usr/bin/python3")
-        .config("spark.jars.excludes", f2j_maven_coord)
-        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
-        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
-        .config("spark.sql.autoBroadcastJoinThreshold", "-1")
-    )
+    """
+    Legacy setup function for unittest-based tests.
+
+    Returns a SparkSession builder configured for Spark Connect.
+    This is used by existing unittest classes that call setup_pyspark().getOrCreate().
+    """
+
+    class SparkConnectBuilder:
+        """Builder that creates Spark Connect sessions."""
+
+        def __init__(self):
+            self._app_name = "pydeequ-test"
+
+        def appName(self, name):
+            self._app_name = name
+            return self
+
+        def master(self, master):
+            # Ignored - we always use Spark Connect
+            return self
+
+        def config(self, key, value):
+            # Ignored - Spark Connect doesn't need these configs
+            return self
+
+        def getOrCreate(self):
+            # Delegate to the module-level helper defined above
+            return create_spark_connect_session()
+
+    return SparkConnectBuilder()
diff --git a/tests/v2/__init__.py b/tests/v2/__init__.py
new file mode 100644
index 0000000..386bcea
--- /dev/null
+++ b/tests/v2/__init__.py
@@ -0,0 +1 @@
+# PyDeequ v2 tests using Spark Connect
diff --git a/tests/v2/conftest.py b/tests/v2/conftest.py
new file mode 100644
index 0000000..0474335
--- /dev/null
+++ b/tests/v2/conftest.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+"""
+Pytest configuration for PyDeequ v2 tests using Spark Connect.
+
+Requirements:
+- Spark Connect server running on localhost:15002
+- Deequ plugin loaded on the server
+
+Start server with:
+    $SPARK_HOME/sbin/start-connect-server.sh \
+        --jars /path/to/deequ-2.0.9-spark-3.5.jar \
+        --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin
+
+Run tests with:
+    SPARK_REMOTE=sc://localhost:15002 pytest tests/v2/ -v
+"""
+
+import os
+
+import pytest
+from pyspark.sql import Row, SparkSession
+
+@pytest.fixture(scope="session")
+def spark():
+    """
+    Session-scoped Spark Connect session.
+    Shared across all tests for efficiency.
+    """
+    remote_url = os.environ.get("SPARK_REMOTE", "sc://localhost:15002")
+    session = SparkSession.builder.remote(remote_url).getOrCreate()
+    yield session
+    session.stop()
+
+
+@pytest.fixture(scope="module")
+def sample_df(spark):
+    """
+    Sample DataFrame used across multiple tests.
+
+    Schema:
+    - a: string (complete)
+    - b: int (complete, unique: 1,2,3)
+    - c: int (has null)
+    - d: int (all same value: 5)
+    """
+    return spark.createDataFrame(
+        [
+            Row(a="foo", b=1, c=5, d=5),
+            Row(a="bar", b=2, c=6, d=5),
+            Row(a="baz", b=3, c=None, d=5),
+        ]
+    )
+
+
+@pytest.fixture(scope="module")
+def extended_df(spark):
+    """
+    Extended DataFrame with more columns for comprehensive tests.
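+
+    Adds columns e, f, g plus email and creditCard on top of the sample_df
+    schema; used by the pattern, email and credit-card constraint tests.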
+ """ + return spark.createDataFrame( + [ + Row( + a="foo", + b=1, + c=5, + d=5, + e=3, + f=1, + g="a", + email="foo@example.com", + creditCard="5130566665286573", + ), + Row( + a="bar", + b=2, + c=6, + d=5, + e=2, + f=2, + g="b", + email="bar@example.com", + creditCard="4532677117740914", + ), + Row( + a="baz", + b=3, + c=None, + d=5, + e=1, + f=1, + g=None, + email="baz@example.com", + creditCard="340145324521741", + ), + ] + ) + + +@pytest.fixture(scope="module") +def profiler_df(spark): + """ + DataFrame with varied data types for Column Profiler testing. + + Schema: + - id: int (complete, unique) + - name: string (complete) + - age: int (has 1 null) + - salary: double (has 1 null) + - active: boolean (complete) + - email: string (has 1 null) + - score: double (has 1 null) + """ + return spark.createDataFrame( + [ + Row(id=1, name="Alice", age=30, salary=75000.0, active=True, + email="alice@example.com", score=85.5), + Row(id=2, name="Bob", age=25, salary=65000.0, active=True, + email="bob@example.com", score=92.0), + Row(id=3, name="Charlie", age=35, salary=None, active=False, + email=None, score=78.5), + Row(id=4, name="Diana", age=28, salary=80000.0, active=True, + email="diana@example.com", score=95.0), + Row(id=5, name="Eve", age=None, salary=70000.0, active=True, + email="eve@example.com", score=None), + Row(id=6, name="Frank", age=45, salary=95000.0, active=True, + email="frank@example.com", score=88.0), + Row(id=7, name="Grace", age=32, salary=72000.0, active=False, + email="grace@example.com", score=91.5), + Row(id=8, name="Henry", age=29, salary=68000.0, active=True, + email="henry@example.com", score=82.0), + ] + ) + + +@pytest.fixture(scope="module") +def suggestion_df(spark): + """ + DataFrame designed to trigger specific constraint suggestions. + + Characteristics: + - id: complete and unique -> should suggest NotNull + Unique + - status: categorical (3 values) -> should suggest IsIn + - score: numeric range -> should suggest Min/Max + - category: categorical (3 values) -> should suggest IsIn + """ + return spark.createDataFrame( + [ + Row(id=1, status="active", score=85, category="A"), + Row(id=2, status="active", score=92, category="B"), + Row(id=3, status="inactive", score=78, category="A"), + Row(id=4, status="active", score=95, category="C"), + Row(id=5, status="pending", score=88, category="B"), + Row(id=6, status="active", score=91, category="A"), + Row(id=7, status="inactive", score=82, category="C"), + Row(id=8, status="active", score=89, category="B"), + ] + ) + + +@pytest.fixture(scope="module") +def e2e_df(spark): + """ + DataFrame for end-to-end testing with realistic data. + + Characteristics: + - Mixed data types (int, string, double) + - Some null values + - Valid email patterns + - Range of numeric values + """ + return spark.createDataFrame( + [ + Row(id=1, name="Alice", email="alice@example.com", age=30, score=85.5), + Row(id=2, name="Bob", email="bob@example.com", age=25, score=92.0), + Row(id=3, name="Charlie", email=None, age=35, score=78.5), + Row(id=4, name="Diana", email="diana@example.com", age=28, score=95.0), + Row(id=5, name="Eve", email="eve@example.com", age=None, score=88.0), + ] + ) diff --git a/tests/v2/test_analyzers.py b/tests/v2/test_analyzers.py new file mode 100644 index 0000000..89da7c4 --- /dev/null +++ b/tests/v2/test_analyzers.py @@ -0,0 +1,339 @@ +# -*- coding: utf-8 -*- +""" +Tests for Analyzers using Spark Connect. + +These tests verify the core analyzer functionality of PyDeequ v2. 
+""" + +import pytest +from pyspark.sql import Row + +from pydeequ.v2.verification import AnalysisRunner +from pydeequ.v2.analyzers import ( + Size, + Completeness, + Mean, + Sum, + Minimum, + Maximum, + StandardDeviation, + ApproxCountDistinct, + Distinctness, + Uniqueness, + UniqueValueRatio, + Entropy, + MinLength, + MaxLength, + Correlation, + ApproxQuantile, + PatternMatch, + Compliance, +) + + +class TestBasicAnalyzers: + """Test basic analyzer types.""" + + def test_size(self, spark, sample_df): + """Test Size analyzer.""" + result = AnalysisRunner(spark).onData(sample_df).addAnalyzer(Size()).run() + + rows = result.collect() + size_row = [r for r in rows if r["name"] == "Size"][0] + assert size_row["value"] == 3.0 + + def test_completeness(self, spark, sample_df): + """Test Completeness analyzer on complete column.""" + result = ( + AnalysisRunner(spark).onData(sample_df).addAnalyzer(Completeness("a")).run() + ) + + rows = result.collect() + assert rows[0]["value"] == 1.0 + + def test_completeness_with_nulls(self, spark, sample_df): + """Test Completeness analyzer on column with nulls.""" + result = ( + AnalysisRunner(spark).onData(sample_df).addAnalyzer(Completeness("c")).run() + ) + + rows = result.collect() + assert abs(rows[0]["value"] - 2 / 3) < 0.001 + + def test_mean(self, spark, sample_df): + """Test Mean analyzer.""" + result = AnalysisRunner(spark).onData(sample_df).addAnalyzer(Mean("b")).run() + + rows = result.collect() + assert rows[0]["value"] == 2.0 + + def test_sum(self, spark, sample_df): + """Test Sum analyzer.""" + result = AnalysisRunner(spark).onData(sample_df).addAnalyzer(Sum("b")).run() + + rows = result.collect() + assert rows[0]["value"] == 6.0 + + def test_minimum(self, spark, sample_df): + """Test Minimum analyzer.""" + result = AnalysisRunner(spark).onData(sample_df).addAnalyzer(Minimum("b")).run() + + rows = result.collect() + assert rows[0]["value"] == 1.0 + + def test_maximum(self, spark, sample_df): + """Test Maximum analyzer.""" + result = AnalysisRunner(spark).onData(sample_df).addAnalyzer(Maximum("b")).run() + + rows = result.collect() + assert rows[0]["value"] == 3.0 + + def test_standard_deviation(self, spark, sample_df): + """Test StandardDeviation analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(StandardDeviation("b")) + .run() + ) + + rows = result.collect() + # std of [1,2,3] is approximately 0.816 + assert abs(rows[0]["value"] - 0.816496580927726) < 0.001 + + +class TestDistinctnessAnalyzers: + """Test distinctness-related analyzers.""" + + def test_approx_count_distinct(self, spark, sample_df): + """Test ApproxCountDistinct analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(ApproxCountDistinct("b")) + .run() + ) + + rows = result.collect() + assert rows[0]["value"] == 3.0 + + def test_distinctness(self, spark, sample_df): + """Test Distinctness analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Distinctness(["b"])) + .run() + ) + + rows = result.collect() + assert rows[0]["value"] == 1.0 # All values are distinct + + def test_distinctness_non_unique(self, spark, sample_df): + """Test Distinctness analyzer on non-unique column.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Distinctness(["d"])) + .run() + ) + + rows = result.collect() + # Column d has all same values, so 1 distinct / 3 rows = 1/3 + assert abs(rows[0]["value"] - 1 / 3) < 0.001 + + def test_uniqueness(self, spark, sample_df): + """Test 
Uniqueness analyzer.""" + result = ( + AnalysisRunner(spark).onData(sample_df).addAnalyzer(Uniqueness(["b"])).run() + ) + + rows = result.collect() + assert rows[0]["value"] == 1.0 + + def test_unique_value_ratio(self, spark, sample_df): + """Test UniqueValueRatio analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(UniqueValueRatio(["b"])) + .run() + ) + + rows = result.collect() + assert rows[0]["value"] == 1.0 + + +class TestStringAnalyzers: + """Test string-related analyzers.""" + + def test_min_length(self, spark, sample_df): + """Test MinLength analyzer.""" + result = ( + AnalysisRunner(spark).onData(sample_df).addAnalyzer(MinLength("a")).run() + ) + + rows = result.collect() + # "foo", "bar", "baz" all have length 3 + assert rows[0]["value"] == 3.0 + + def test_max_length(self, spark, sample_df): + """Test MaxLength analyzer.""" + result = ( + AnalysisRunner(spark).onData(sample_df).addAnalyzer(MaxLength("a")).run() + ) + + rows = result.collect() + assert rows[0]["value"] == 3.0 + + def test_pattern_match(self, spark, sample_df): + """Test PatternMatch analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(PatternMatch("a", r"ba.*")) + .run() + ) + + rows = result.collect() + # "bar" and "baz" match, "foo" doesn't = 2/3 + assert abs(rows[0]["value"] - 2 / 3) < 0.001 + + +class TestStatisticalAnalyzers: + """Test statistical analyzers.""" + + def test_entropy(self, spark, sample_df): + """Test Entropy analyzer.""" + result = AnalysisRunner(spark).onData(sample_df).addAnalyzer(Entropy("a")).run() + + rows = result.collect() + # 3 distinct values with equal frequency -> log(3) ~ 1.099 + assert abs(rows[0]["value"] - 1.0986122886681096) < 0.001 + + def test_correlation(self, spark, sample_df): + """Test Correlation analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Correlation("b", "c")) + .run() + ) + + rows = result.collect() + # b=[1,2,3], c=[5,6,None] -> perfect correlation on non-null pairs + assert rows[0]["value"] == 1.0 + + def test_approx_quantile(self, spark, sample_df): + """Test ApproxQuantile analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(ApproxQuantile("b", 0.5)) + .run() + ) + + rows = result.collect() + # Median of [1,2,3] is 2 + assert rows[0]["value"] == 2.0 + + def test_compliance(self, spark, sample_df): + """Test Compliance analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Compliance("positive_b", "b > 0")) + .run() + ) + + rows = result.collect() + # All values are positive + assert rows[0]["value"] == 1.0 + + +class TestMultipleAnalyzers: + """Test running multiple analyzers together.""" + + def test_multiple_analyzers(self, spark, sample_df): + """Test running multiple analyzers in one run.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("a")) + .addAnalyzer(Mean("b")) + .addAnalyzer(Maximum("b")) + .addAnalyzer(Minimum("b")) + .run() + ) + + rows = result.collect() + + # Check we got results for all analyzers + names = [r["name"] for r in rows] + assert "Size" in names + assert "Completeness" in names + assert "Mean" in names + assert "Maximum" in names + assert "Minimum" in names + + def test_multiple_completeness(self, spark, sample_df): + """Test Completeness on multiple columns.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Completeness("a")) + .addAnalyzer(Completeness("b")) + 
.addAnalyzer(Completeness("c")) + .run() + ) + + rows = result.collect() + values = {r["instance"]: r["value"] for r in rows} + + assert values["a"] == 1.0 + assert values["b"] == 1.0 + assert abs(values["c"] - 2 / 3) < 0.001 + + +class TestAnalyzerWithWhere: + """Test analyzers with where clause filtering.""" + + def test_size_with_where(self, spark, sample_df): + """Test Size analyzer with where clause.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Size(where="b > 1")) + .run() + ) + + rows = result.collect() + # Only rows where b > 1 (b=2 and b=3) = 2 rows + assert rows[0]["value"] == 2.0 + + def test_completeness_with_where(self, spark, sample_df): + """Test Completeness analyzer with where clause.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Completeness("c", where="b <= 2")) + .run() + ) + + rows = result.collect() + # Rows where b <= 2: (b=1, c=5), (b=2, c=6) -> both have c values + assert rows[0]["value"] == 1.0 + + def test_mean_with_where(self, spark, sample_df): + """Test Mean analyzer with where clause.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Mean("b", where="b > 1")) + .run() + ) + + rows = result.collect() + # Mean of [2, 3] = 2.5 + assert rows[0]["value"] == 2.5 diff --git a/tests/v2/test_checks.py b/tests/v2/test_checks.py new file mode 100644 index 0000000..a92a81b --- /dev/null +++ b/tests/v2/test_checks.py @@ -0,0 +1,320 @@ +# -*- coding: utf-8 -*- +""" +Tests for Check constraints using Spark Connect. + +These tests verify the core constraint functionality of PyDeequ v2. +""" + +import pytest +from pyspark.sql import Row + +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.verification import VerificationSuite +from pydeequ.v2.predicates import eq, gt, gte, lt, lte, between + + +class TestCheckConstraints: + """Test individual constraint types.""" + + def test_hasSize(self, spark, sample_df): + """Test hasSize constraint.""" + check = Check(CheckLevel.Error, "size check").hasSize(eq(3)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert len(rows) == 1 + assert rows[0]["constraint_status"] == "Success" + + def test_hasSize_failure(self, spark, sample_df): + """Test hasSize constraint failure.""" + check = Check(CheckLevel.Error, "size check").hasSize(eq(5)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Failure" + + def test_isComplete(self, spark, sample_df): + """Test isComplete constraint on complete column.""" + check = Check(CheckLevel.Error, "completeness check").isComplete("a") + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_isComplete_failure(self, spark, sample_df): + """Test isComplete constraint on incomplete column.""" + check = Check(CheckLevel.Error, "completeness check").isComplete("c") + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Failure" + + def test_hasCompleteness(self, spark, sample_df): + """Test hasCompleteness with threshold.""" + # Column c has 2/3 completeness + check = Check(CheckLevel.Error, "completeness check").hasCompleteness( + "c", gte(0.5) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = 
result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_hasCompleteness_failure(self, spark, sample_df): + """Test hasCompleteness failure.""" + check = Check(CheckLevel.Error, "completeness check").hasCompleteness( + "c", gte(0.9) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Failure" + + def test_isUnique(self, spark, sample_df): + """Test isUnique constraint.""" + check = Check(CheckLevel.Error, "uniqueness check").isUnique("b") + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_isUnique_failure(self, spark, sample_df): + """Test isUnique constraint failure on non-unique column.""" + check = Check(CheckLevel.Error, "uniqueness check").isUnique("d") + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Failure" + + def test_hasUniqueness(self, spark, sample_df): + """Test hasUniqueness with multiple columns.""" + check = Check(CheckLevel.Error, "uniqueness check").hasUniqueness( + ["a", "b"], eq(1.0) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_hasMin(self, spark, sample_df): + """Test hasMin constraint.""" + check = Check(CheckLevel.Error, "min check").hasMin("b", eq(1.0)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_hasMax(self, spark, sample_df): + """Test hasMax constraint.""" + check = Check(CheckLevel.Error, "max check").hasMax("b", eq(3.0)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_hasMean(self, spark, sample_df): + """Test hasMean constraint.""" + check = Check(CheckLevel.Error, "mean check").hasMean("b", eq(2.0)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_hasSum(self, spark, sample_df): + """Test hasSum constraint.""" + check = Check(CheckLevel.Error, "sum check").hasSum("b", eq(6.0)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_hasStandardDeviation(self, spark, sample_df): + """Test hasStandardDeviation constraint.""" + # std of [1,2,3] is ~0.816 + check = Check(CheckLevel.Error, "std check").hasStandardDeviation( + "b", between(0.8, 0.9) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + +class TestCheckChaining: + """Test chaining multiple constraints.""" + + def test_multiple_constraints_all_pass(self, spark, sample_df): + """Test multiple constraints that all pass.""" + check = ( + Check(CheckLevel.Error, "multi check") + .hasSize(eq(3)) + .isComplete("a") + .isUnique("b") + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert len(rows) == 3 + assert all(row["constraint_status"] == "Success" for row in rows) + + def 
test_multiple_constraints_some_fail(self, spark, sample_df): + """Test multiple constraints with some failures.""" + check = ( + Check(CheckLevel.Error, "multi check") + .hasSize(eq(3)) # pass + .isComplete("c") # fail (has null) + .isUnique("d") + ) # fail (all same value) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert len(rows) == 3 + statuses = [row["constraint_status"] for row in rows] + assert statuses.count("Success") == 1 + assert statuses.count("Failure") == 2 + + +class TestCheckLevels: + """Test check level (Error vs Warning).""" + + def test_error_level(self, spark, sample_df): + """Test Error level check.""" + check = Check(CheckLevel.Error, "error check").hasSize(eq(3)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["check_level"] == "Error" + + def test_warning_level(self, spark, sample_df): + """Test Warning level check.""" + check = Check(CheckLevel.Warning, "warning check").hasSize(eq(3)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["check_level"] == "Warning" + + +class TestPredicates: + """Test different predicate types.""" + + def test_eq_predicate(self, spark, sample_df): + """Test eq (equals) predicate.""" + check = Check(CheckLevel.Error, "eq test").hasSize(eq(3)) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_gt_predicate(self, spark, sample_df): + """Test gt (greater than) predicate.""" + check = Check(CheckLevel.Error, "gt test").hasSize(gt(2)) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_gte_predicate(self, spark, sample_df): + """Test gte (greater than or equal) predicate.""" + check = Check(CheckLevel.Error, "gte test").hasSize(gte(3)) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_lt_predicate(self, spark, sample_df): + """Test lt (less than) predicate.""" + check = Check(CheckLevel.Error, "lt test").hasSize(lt(4)) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_lte_predicate(self, spark, sample_df): + """Test lte (less than or equal) predicate.""" + check = Check(CheckLevel.Error, "lte test").hasSize(lte(3)) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_between_predicate(self, spark, sample_df): + """Test between predicate.""" + check = Check(CheckLevel.Error, "between test").hasSize(between(2, 4)) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + +class TestAdditionalConstraints: + """Test additional constraint types.""" + + def test_areComplete(self, spark, sample_df): + """Test areComplete constraint.""" + check = Check(CheckLevel.Error, "are complete").areComplete(["a", "b"]) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_hasDistinctness(self, spark, sample_df): + """Test hasDistinctness constraint.""" + # Column b has 3 
distinct values out of 3 rows = 1.0 distinctness + check = Check(CheckLevel.Error, "distinctness").hasDistinctness(["b"], eq(1.0)) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_hasApproxCountDistinct(self, spark, sample_df): + """Test hasApproxCountDistinct constraint.""" + check = Check(CheckLevel.Error, "approx count").hasApproxCountDistinct( + "b", eq(3.0) + ) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_satisfies(self, spark, sample_df): + """Test satisfies constraint with SQL expression.""" + check = Check(CheckLevel.Error, "satisfies").satisfies( + "b > 0", "positive_b", eq(1.0) + ) + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_hasPattern(self, spark, extended_df): + """Test hasPattern constraint.""" + # All emails match the pattern + check = Check(CheckLevel.Error, "pattern").hasPattern( + "email", r".*@.*\.com", eq(1.0) + ) + result = VerificationSuite(spark).onData(extended_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_containsEmail(self, spark, extended_df): + """Test containsEmail constraint.""" + check = Check(CheckLevel.Error, "email").containsEmail("email", eq(1.0)) + result = VerificationSuite(spark).onData(extended_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_containsCreditCardNumber(self, spark, extended_df): + """Test containsCreditCardNumber constraint.""" + check = Check(CheckLevel.Error, "credit card").containsCreditCardNumber( + "creditCard", eq(1.0) + ) + result = VerificationSuite(spark).onData(extended_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_isNonNegative(self, spark, sample_df): + """Test isNonNegative constraint.""" + check = Check(CheckLevel.Error, "non negative").isNonNegative("b") + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" + + def test_isPositive(self, spark, sample_df): + """Test isPositive constraint.""" + check = Check(CheckLevel.Error, "positive").isPositive("b") + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + assert result.collect()[0]["constraint_status"] == "Success" diff --git a/tests/v2/test_e2e_spark_connect.py b/tests/v2/test_e2e_spark_connect.py new file mode 100644 index 0000000..58c18fd --- /dev/null +++ b/tests/v2/test_e2e_spark_connect.py @@ -0,0 +1,647 @@ +# -*- coding: utf-8 -*- +""" +End-to-End tests for PyDeequ via Spark Connect. + +These tests verify that the full Spark Connect pipeline works correctly, +from Python client through the gRPC protocol to the Scala DeequRelationPlugin. + +Prerequisites: +1. Build the Deequ JAR with Spark Connect plugin: + cd deequ && mvn package -DskipTests + +2. Start Spark Connect server with the plugin: + ./scripts/start-spark-connect.sh + +3. Run these tests: + SPARK_REMOTE=sc://localhost:15002 pytest tests/test_e2e_spark_connect.py -v + +Note: These tests do NOT use Py4J fallback - they test the actual Spark Connect +protocol with the DeequRelationPlugin on the server side. 
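+
+Every test below round-trips through the plugin, so the whole module is skipped
+automatically when SPARK_REMOTE is not set (see the pytestmark skipif right
+after the imports).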
+""" + +import os + +import pytest +from pyspark.sql import Row + +from pydeequ.v2.analyzers import ( + Completeness, + Distinctness, + Maximum, + Mean, + Minimum, + Size, + StandardDeviation, + Uniqueness, +) +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.predicates import between, eq, gt, gte, is_one, lt, lte +from pydeequ.v2.profiles import ColumnProfilerRunner, KLLParameters +from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + +# Import the new Spark Connect API +from pydeequ.v2.verification import AnalysisRunner, VerificationSuite + +# Skip all tests if SPARK_REMOTE is not set +pytestmark = pytest.mark.skipif( + "SPARK_REMOTE" not in os.environ, + reason="SPARK_REMOTE environment variable not set. Start Spark Connect server first.", +) + + +# Note: spark fixture is defined in conftest.py (session-scoped) + + +@pytest.fixture(scope="module") +def sample_df(e2e_df): + """ + Alias for e2e_df from conftest.py. + + Schema: id (int), name (string), email (string), age (int), score (double) + - 5 rows total + - email has 1 null (80% complete) + - age has 1 null (80% complete) + """ + return e2e_df + + +class TestVerificationSuiteE2E: + """End-to-end tests for VerificationSuite via Spark Connect.""" + + def test_size_check(self, spark, sample_df): + """Test that hasSize check works via Spark Connect.""" + check = Check(CheckLevel.Error, "Size check").hasSize(eq(5)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + # Result should be a DataFrame + assert result is not None + + # Collect results + rows = result.collect() + assert len(rows) > 0 + + # Check should pass (we have exactly 5 rows) + row = rows[0] + assert row["constraint_status"] == "Success" + + def test_completeness_check_passing(self, spark, sample_df): + """Test completeness check that should pass.""" + check = ( + Check(CheckLevel.Error, "Completeness check") + .isComplete("id") + .isComplete("name") + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + + # Both constraints should pass (id and name are complete) + for row in rows: + assert row["constraint_status"] == "Success" + + def test_completeness_check_failing(self, spark, sample_df): + """Test completeness check that should fail.""" + check = Check(CheckLevel.Error, "Completeness check").isComplete( + "email" + ) # email has NULL values + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + + # Should fail because email has NULL + assert len(rows) > 0 + assert rows[0]["constraint_status"] == "Failure" + + def test_has_completeness_with_threshold(self, spark, sample_df): + """Test hasCompleteness with a threshold.""" + # email is 80% complete (4 out of 5) + check = Check(CheckLevel.Warning, "Completeness threshold").hasCompleteness( + "email", gte(0.8) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert len(rows) > 0 + assert rows[0]["constraint_status"] == "Success" + + def test_uniqueness_check(self, spark, sample_df): + """Test uniqueness check.""" + check = Check(CheckLevel.Error, "Uniqueness check").isUnique("id") + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert len(rows) > 0 + assert rows[0]["constraint_status"] == "Success" + + def test_mean_check(self, spark, sample_df): + """Test mean check with range assertion.""" + # Mean age should be around 29.5 
(average of 30, 25, 35, 28, NULL) + check = Check(CheckLevel.Error, "Mean check").hasMean( + "score", between(80.0, 95.0) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert len(rows) > 0 + assert rows[0]["constraint_status"] == "Success" + + def test_multiple_checks(self, spark, sample_df): + """Test multiple checks in a single verification run.""" + check = ( + Check(CheckLevel.Error, "Multiple checks") + .hasSize(eq(5)) + .isComplete("id") + .isComplete("name") + .isUnique("id") + .hasCompleteness("email", gte(0.7)) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + + # All 5 constraints should pass + assert len(rows) == 5 + for row in rows: + assert row["constraint_status"] == "Success" + + def test_check_levels(self, spark, sample_df): + """Test both Error and Warning check levels.""" + error_check = Check(CheckLevel.Error, "Error level check").isComplete("id") + + warning_check = Check(CheckLevel.Warning, "Warning level check").isComplete( + "email" + ) # Will fail + + result = ( + VerificationSuite(spark) + .onData(sample_df) + .addCheck(error_check) + .addCheck(warning_check) + .run() + ) + + rows = result.collect() + + # Find the results for each check + error_result = [r for r in rows if r["check"] == "Error level check"][0] + warning_result = [r for r in rows if r["check"] == "Warning level check"][0] + + assert error_result["check_level"] == "Error" + assert error_result["constraint_status"] == "Success" + + assert warning_result["check_level"] == "Warning" + assert warning_result["constraint_status"] == "Failure" + + +class TestAnalysisRunnerE2E: + """End-to-end tests for AnalysisRunner via Spark Connect.""" + + def test_size_analyzer(self, spark, sample_df): + """Test Size analyzer.""" + result = AnalysisRunner(spark).onData(sample_df).addAnalyzer(Size()).run() + + rows = result.collect() + assert len(rows) > 0 + + # Find the Size metric + size_row = [r for r in rows if r["name"] == "Size"][0] + assert float(size_row["value"]) == 5.0 + + def test_completeness_analyzer(self, spark, sample_df): + """Test Completeness analyzer.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Completeness("id")) + .addAnalyzer(Completeness("email")) + .run() + ) + + rows = result.collect() + + # id should be 100% complete + id_row = [r for r in rows if r["instance"] == "id"][0] + assert float(id_row["value"]) == 1.0 + + # email should be 80% complete + email_row = [r for r in rows if r["instance"] == "email"][0] + assert float(email_row["value"]) == 0.8 + + def test_statistical_analyzers(self, spark, sample_df): + """Test statistical analyzers (Mean, Min, Max, StdDev).""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Mean("score")) + .addAnalyzer(Minimum("score")) + .addAnalyzer(Maximum("score")) + .addAnalyzer(StandardDeviation("score")) + .run() + ) + + rows = result.collect() + + # Extract values by metric name + metrics = {r["name"]: float(r["value"]) for r in rows} + + # Verify expected ranges + assert 85.0 <= metrics["Mean"] <= 90.0 # Mean of scores + assert metrics["Minimum"] == 78.5 + assert metrics["Maximum"] == 95.0 + assert metrics["StandardDeviation"] > 0 # Should have some variance + + def test_multiple_analyzers(self, spark, sample_df): + """Test running multiple analyzers together.""" + result = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("id")) + 
.addAnalyzer(Completeness("email")) + .addAnalyzer(Mean("age")) + .addAnalyzer(Mean("score")) + .run() + ) + + rows = result.collect() + + # Should have results for all analyzers + assert len(rows) >= 5 + + +class TestEdgeCasesE2E: + """Test edge cases and error handling.""" + + def test_empty_dataframe(self, spark): + """Test verification on empty DataFrame.""" + empty_df = spark.createDataFrame([], "id: int, name: string") + + check = Check(CheckLevel.Error, "Empty DF check").hasSize(eq(0)) + + result = VerificationSuite(spark).onData(empty_df).addCheck(check).run() + + rows = result.collect() + assert len(rows) > 0 + assert rows[0]["constraint_status"] == "Success" + + def test_all_null_column(self, spark): + """Test completeness on all-NULL column.""" + from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + schema = StructType([ + StructField("id", IntegerType(), False), + StructField("val", StringType(), True), + ]) + data = [Row(id=1, val=None), Row(id=2, val=None)] + df = spark.createDataFrame(data, schema=schema) + + check = Check(CheckLevel.Error, "Null column check").hasCompleteness( + "val", eq(0.0) + ) + + result = VerificationSuite(spark).onData(df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_single_row(self, spark): + """Test verification on single-row DataFrame.""" + data = [Row(id=1, name="Test")] + df = spark.createDataFrame(data) + + check = ( + Check(CheckLevel.Error, "Single row check") + .hasSize(eq(1)) + .isComplete("id") + .isUnique("id") + ) + + result = VerificationSuite(spark).onData(df).addCheck(check).run() + + rows = result.collect() + for row in rows: + assert row["constraint_status"] == "Success" + + +class TestPredicatesE2E: + """Test various predicates via Spark Connect.""" + + def test_eq_predicate(self, spark, sample_df): + """Test eq() predicate.""" + check = Check(CheckLevel.Error, "EQ test").hasSize(eq(5)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_gte_predicate(self, spark, sample_df): + """Test gte() predicate.""" + check = Check(CheckLevel.Error, "GTE test").hasCompleteness("id", gte(1.0)) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_between_predicate(self, spark, sample_df): + """Test between() predicate.""" + check = Check(CheckLevel.Error, "Between test").hasMean( + "score", between(80.0, 95.0) + ) + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_lt_predicate(self, spark, sample_df): + """Test lt() predicate - should fail when condition not met.""" + check = Check(CheckLevel.Error, "LT test").hasSize( + lt(3) + ) # We have 5 rows, so this should fail + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Failure" + + +class TestColumnProfilerE2E: + """End-to-end tests for Column Profiler via Spark Connect.""" + + def test_basic_profiling(self, spark, sample_df): + """Test basic column profiling.""" + result = ColumnProfilerRunner(spark).onData(sample_df).run() + + rows = result.collect() + + # Should have one profile per column + assert len(rows) == len(sample_df.columns) + + # Verify columns 
are profiled + profiled_columns = {r["column"] for r in rows} + expected_columns = set(sample_df.columns) + assert profiled_columns == expected_columns + + def test_completeness_profiling(self, spark, sample_df): + """Test completeness values in profiles.""" + result = ColumnProfilerRunner(spark).onData(sample_df).run() + + rows = {r["column"]: r for r in result.collect()} + + # id is complete (100%) + assert rows["id"]["completeness"] == 1.0 + + # email has one null (80%) + assert abs(rows["email"]["completeness"] - 0.8) < 0.001 + + # age has one null (80%) + assert abs(rows["age"]["completeness"] - 0.8) < 0.001 + + def test_numeric_statistics_profiling(self, spark, sample_df): + """Test numeric statistics in profiles.""" + result = ColumnProfilerRunner(spark).onData(sample_df).run() + + rows = {r["column"]: r for r in result.collect()} + + # Verify score statistics + score_profile = rows["score"] + assert score_profile["minimum"] == 78.5 + assert score_profile["maximum"] == 95.0 + assert score_profile["mean"] is not None + + def test_restrict_to_columns(self, spark, sample_df): + """Test profiling restricted to specific columns.""" + result = ( + ColumnProfilerRunner(spark) + .onData(sample_df) + .restrictToColumns(["id", "name"]) + .run() + ) + + rows = result.collect() + profiled_columns = {r["column"] for r in rows} + + assert profiled_columns == {"id", "name"} + + def test_kll_profiling(self, spark, sample_df): + """Test KLL sketch profiling for numeric columns.""" + result = ( + ColumnProfilerRunner(spark) + .onData(sample_df) + .withKLLProfiling() + .run() + ) + + rows = {r["column"]: r for r in result.collect()} + + # Numeric columns should have KLL buckets + assert rows["score"]["kll_buckets"] is not None + assert rows["age"]["kll_buckets"] is not None + + # String columns should not have KLL buckets + assert rows["name"]["kll_buckets"] is None + + def test_kll_custom_parameters(self, spark, sample_df): + """Test KLL profiling with custom parameters.""" + params = KLLParameters(sketch_size=1024, shrinking_factor=0.5, num_buckets=32) + result = ( + ColumnProfilerRunner(spark) + .onData(sample_df) + .withKLLProfiling() + .setKLLParameters(params) + .run() + ) + + # Verify it runs without error + assert result.count() > 0 + + def test_histogram_threshold(self, spark, sample_df): + """Test histogram computation for low cardinality columns.""" + result = ( + ColumnProfilerRunner(spark) + .onData(sample_df) + .withLowCardinalityHistogramThreshold(10) + .run() + ) + + rows = {r["column"]: r for r in result.collect()} + + # id has 5 distinct values, should have histogram + assert rows["id"]["histogram"] is not None + + +class TestConstraintSuggestionsE2E: + """End-to-end tests for Constraint Suggestions via Spark Connect.""" + + def test_default_rules(self, spark, sample_df): + """Test DEFAULT rules generate suggestions.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(sample_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + + # Should generate some suggestions + assert len(rows) > 0 + + # Check required columns + columns = result.columns + assert "column_name" in columns + assert "constraint_name" in columns + assert "code_for_constraint" in columns + + def test_extended_rules(self, spark, sample_df): + """Test EXTENDED rules generate comprehensive suggestions.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(sample_df) + .addConstraintRules(Rules.EXTENDED) + .run() + ) + + # Extended rules should generate suggestions + assert 
result.count() >= 0 + + def test_restrict_to_columns(self, spark, sample_df): + """Test suggestions restricted to specific columns.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(sample_df) + .addConstraintRules(Rules.DEFAULT) + .restrictToColumns(["id", "name"]) + .run() + ) + + rows = result.collect() + columns_with_suggestions = set(r["column_name"] for r in rows) + + # Only restricted columns should have suggestions + assert columns_with_suggestions.issubset({"id", "name"}) + + def test_train_test_split(self, spark, sample_df): + """Test train/test split evaluation.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(sample_df) + .addConstraintRules(Rules.DEFAULT) + .useTrainTestSplitWithTestsetRatio(0.3, seed=42) + .run() + ) + + # Should have evaluation columns + assert "evaluation_status" in result.columns + assert "evaluation_metric_value" in result.columns + + def test_code_for_constraint(self, spark, sample_df): + """Test code_for_constraint is properly formatted.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(sample_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + for row in rows: + code = row["code_for_constraint"] + # Should be non-empty + assert code is not None + assert len(code) > 0 + # Should not have Scala-specific syntax + assert "Some(" not in code + assert "Seq(" not in code + + def test_suggestion_to_check_workflow(self, spark, sample_df): + """Test end-to-end workflow: get suggestions and verify data.""" + # Step 1: Get suggestions + suggestions = ( + ConstraintSuggestionRunner(spark) + .onData(sample_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + suggestion_rows = suggestions.collect() + assert len(suggestion_rows) > 0 + + # Step 2: Use suggestions to build verification + # Find a completeness suggestion for 'id' + id_suggestions = [ + s for s in suggestion_rows + if s["column_name"] == "id" and "Completeness" in s["constraint_name"] + ] + + if id_suggestions: + # We have a completeness suggestion - verify it with a check + check = Check(CheckLevel.Error, "From suggestion").isComplete("id") + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + +class TestCombinedFeaturesE2E: + """Test combining multiple V2 features in workflows.""" + + def test_profile_then_verify(self, spark, sample_df): + """Test workflow: profile data, then verify based on findings.""" + # Step 1: Profile the data + profiles = ColumnProfilerRunner(spark).onData(sample_df).run() + + profile_rows = {r["column"]: r for r in profiles.collect()} + + # Step 2: Create checks based on profile findings + # If id is 100% complete, verify that + if profile_rows["id"]["completeness"] == 1.0: + check = Check(CheckLevel.Error, "Profile-based check").isComplete("id") + + result = VerificationSuite(spark).onData(sample_df).addCheck(check).run() + + rows = result.collect() + assert rows[0]["constraint_status"] == "Success" + + def test_analyze_profile_suggest(self, spark, sample_df): + """Test combined workflow: analyze, profile, and get suggestions.""" + # Step 1: Run analysis + analysis = ( + AnalysisRunner(spark) + .onData(sample_df) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("id")) + .run() + ) + analysis_rows = analysis.collect() + assert len(analysis_rows) >= 2 + + # Step 2: Profile columns + profiles = ColumnProfilerRunner(spark).onData(sample_df).run() + profile_rows = profiles.collect() + assert 
len(profile_rows) == len(sample_df.columns) + + # Step 3: Get suggestions + suggestions = ( + ConstraintSuggestionRunner(spark) + .onData(sample_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + suggestion_rows = suggestions.collect() + assert len(suggestion_rows) >= 0 # May be empty for small datasets + + +if __name__ == "__main__": + # Run tests directly + pytest.main([__file__, "-v"]) diff --git a/tests/v2/test_profiles.py b/tests/v2/test_profiles.py new file mode 100644 index 0000000..e1c8030 --- /dev/null +++ b/tests/v2/test_profiles.py @@ -0,0 +1,254 @@ +# -*- coding: utf-8 -*- +""" +Tests for Column Profiler functionality. + +These tests verify that the Column Profiler correctly analyzes DataFrame columns +and returns expected statistics. +""" + +import json + +import pytest +from pyspark.sql import Row + +from pydeequ.v2.profiles import ColumnProfilerRunner, KLLParameters + + +class TestBasicProfiling: + """Test basic profiling metrics.""" + + def test_completeness_calculation(self, spark, profiler_df): + """Test completeness is correctly calculated.""" + result = ColumnProfilerRunner(spark).onData(profiler_df).run() + rows = {r["column"]: r for r in result.collect()} + + # id column is complete (8/8 = 1.0) + assert rows["id"]["completeness"] == 1.0 + + # salary has 1 null out of 8 (7/8 = 0.875) + assert abs(rows["salary"]["completeness"] - 7 / 8) < 0.001 + + # age has 1 null out of 8 + assert abs(rows["age"]["completeness"] - 7 / 8) < 0.001 + + def test_data_type_inference(self, spark, profiler_df): + """Test data types are correctly inferred.""" + result = ColumnProfilerRunner(spark).onData(profiler_df).run() + rows = {r["column"]: r for r in result.collect()} + + # Check data types contain expected type indicators + # Deequ returns "Integral" for integer types + assert ( + "Integral" in rows["id"]["data_type"] + or "Integer" in rows["id"]["data_type"] + or "Long" in rows["id"]["data_type"] + ) + assert "String" in rows["name"]["data_type"] + # Deequ returns "Fractional" for double types + assert ( + "Fractional" in rows["salary"]["data_type"] + or "Double" in rows["salary"]["data_type"] + ) + assert "Boolean" in rows["active"]["data_type"] + + def test_approx_distinct_values(self, spark, profiler_df): + """Test approximate distinct value count.""" + result = ColumnProfilerRunner(spark).onData(profiler_df).run() + rows = {r["column"]: r for r in result.collect()} + + # id should have 8 distinct values + assert rows["id"]["approx_distinct_values"] == 8 + + # active (boolean) should have 2 distinct values + assert rows["active"]["approx_distinct_values"] == 2 + + def test_all_columns_profiled(self, spark, profiler_df): + """Test that all columns are profiled by default.""" + result = ColumnProfilerRunner(spark).onData(profiler_df).run() + rows = result.collect() + + expected_columns = {"id", "name", "age", "salary", "active", "email", "score"} + profiled_columns = {r["column"] for r in rows} + + assert profiled_columns == expected_columns + + +class TestNumericProfiling: + """Test numeric column profiling.""" + + def test_numeric_statistics(self, spark, profiler_df): + """Test mean, min, max, sum, stddev for numeric columns.""" + result = ColumnProfilerRunner(spark).onData(profiler_df).run() + rows = {r["column"]: r for r in result.collect()} + + age_profile = rows["age"] + # age values: 30, 25, 35, 28, None, 45, 32, 29 + # min=25, max=45 + assert age_profile["minimum"] == 25.0 + assert age_profile["maximum"] == 45.0 + assert age_profile["mean"] is not None + assert 
age_profile["std_dev"] is not None + + def test_non_numeric_has_null_stats(self, spark, profiler_df): + """Test non-numeric columns have null for numeric stats.""" + result = ColumnProfilerRunner(spark).onData(profiler_df).run() + rows = {r["column"]: r for r in result.collect()} + + name_profile = rows["name"] + assert name_profile["mean"] is None + assert name_profile["minimum"] is None + assert name_profile["maximum"] is None + + +class TestKLLProfiling: + """Test KLL sketch profiling.""" + + def test_kll_disabled_by_default(self, spark, profiler_df): + """Test KLL is not computed by default.""" + result = ColumnProfilerRunner(spark).onData(profiler_df).run() + rows = {r["column"]: r for r in result.collect()} + + assert rows["age"]["kll_buckets"] is None + + def test_kll_enabled(self, spark, profiler_df): + """Test KLL buckets are computed when enabled.""" + result = ( + ColumnProfilerRunner(spark).onData(profiler_df).withKLLProfiling().run() + ) + rows = {r["column"]: r for r in result.collect()} + + # Numeric columns should have KLL buckets + assert rows["age"]["kll_buckets"] is not None + assert rows["salary"]["kll_buckets"] is not None + # Non-numeric should not + assert rows["name"]["kll_buckets"] is None + + def test_kll_custom_parameters(self, spark, profiler_df): + """Test custom KLL parameters are applied.""" + params = KLLParameters(sketch_size=1024, shrinking_factor=0.5, num_buckets=32) + result = ( + ColumnProfilerRunner(spark) + .onData(profiler_df) + .withKLLProfiling() + .setKLLParameters(params) + .run() + ) + # Just verify it runs without error + assert result.count() > 0 + + +class TestProfilerOptions: + """Test profiler configuration options.""" + + def test_restrict_to_columns(self, spark, profiler_df): + """Test restricting profiling to specific columns.""" + result = ( + ColumnProfilerRunner(spark) + .onData(profiler_df) + .restrictToColumns(["id", "name"]) + .run() + ) + + columns = [r["column"] for r in result.collect()] + assert set(columns) == {"id", "name"} + + def test_low_cardinality_histogram(self, spark, profiler_df): + """Test histogram is computed for low cardinality columns.""" + result = ( + ColumnProfilerRunner(spark) + .onData(profiler_df) + .withLowCardinalityHistogramThreshold(10) + .run() + ) + rows = {r["column"]: r for r in result.collect()} + + # active (2 values) should have histogram + assert rows["active"]["histogram"] is not None + # Verify histogram is valid JSON + histogram = json.loads(rows["active"]["histogram"]) + assert len(histogram) > 0 + + def test_predefined_types(self, spark, profiler_df): + """Test predefined types override inference.""" + result = ( + ColumnProfilerRunner(spark) + .onData(profiler_df) + .setPredefinedTypes({"id": "String"}) + .run() + ) + rows = {r["column"]: r for r in result.collect()} + + assert rows["id"]["is_data_type_inferred"] is False + + +class TestProfilerEdgeCases: + """Test edge cases for profiler.""" + + def test_all_null_column(self, spark): + """Test profiling column with all nulls.""" + from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + schema = StructType( + [ + StructField("id", IntegerType(), False), + StructField("value", StringType(), True), + ] + ) + df = spark.createDataFrame( + [(1, None), (2, None)], + schema=schema, + ) + result = ColumnProfilerRunner(spark).onData(df).run() + rows = {r["column"]: r for r in result.collect()} + + assert rows["value"]["completeness"] == 0.0 + + def test_single_row(self, spark): + """Test profiling single row 
DataFrame.""" + df = spark.createDataFrame([Row(id=1, value=100)]) + result = ColumnProfilerRunner(spark).onData(df).run() + rows = {r["column"]: r for r in result.collect()} + + assert rows["value"]["minimum"] == 100.0 + assert rows["value"]["maximum"] == 100.0 + assert rows["value"]["completeness"] == 1.0 + + def test_large_dataframe(self, spark): + """Test profiling larger DataFrame.""" + df = spark.createDataFrame( + [Row(id=i, value=i * 10, category=f"cat_{i % 5}") for i in range(1000)] + ) + result = ColumnProfilerRunner(spark).onData(df).run() + rows = {r["column"]: r for r in result.collect()} + + # Allow some approximation error for HyperLogLog-based distinct count + assert rows["id"]["approx_distinct_values"] >= 950 + assert rows["category"]["approx_distinct_values"] == 5 + + +class TestKLLParametersUnit: + """Unit tests for KLLParameters (no Spark needed).""" + + def test_default_parameters(self): + """Test default KLL parameters.""" + params = KLLParameters() + assert params.sketch_size == 2048 + assert params.shrinking_factor == 0.64 + assert params.num_buckets == 64 + + def test_custom_parameters(self): + """Test custom KLL parameters.""" + params = KLLParameters(sketch_size=1024, shrinking_factor=0.5, num_buckets=32) + assert params.sketch_size == 1024 + assert params.shrinking_factor == 0.5 + assert params.num_buckets == 32 + + def test_to_proto(self): + """Test conversion to protobuf.""" + params = KLLParameters(sketch_size=512, shrinking_factor=0.7, num_buckets=16) + proto_msg = params.to_proto() + + # Proto uses snake_case field names + assert proto_msg.sketch_size == 512 + assert proto_msg.shrinking_factor == 0.7 + assert proto_msg.number_of_buckets == 16 diff --git a/tests/v2/test_suggestions.py b/tests/v2/test_suggestions.py new file mode 100644 index 0000000..360b10b --- /dev/null +++ b/tests/v2/test_suggestions.py @@ -0,0 +1,330 @@ +# -*- coding: utf-8 -*- +""" +Tests for Constraint Suggestion functionality. + +These tests verify that the Constraint Suggestion module correctly analyzes +DataFrame columns and suggests appropriate data quality constraints. 
+""" + +import pytest +from pyspark.sql import Row + +from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + + +class TestBasicSuggestions: + """Test basic constraint suggestion generation.""" + + def test_default_rules_generate_suggestions(self, spark, suggestion_df): + """Test DEFAULT rules generate suggestions.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + assert len(rows) > 0 + + # Check required columns exist + columns = result.columns + assert "column_name" in columns + assert "constraint_name" in columns + assert "code_for_constraint" in columns + assert "description" in columns + assert "suggesting_rule" in columns + + def test_completeness_suggestion(self, spark, suggestion_df): + """Test completeness constraints are suggested for complete columns.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + id_suggestions = [r for r in rows if r["column_name"] == "id"] + + # id column is complete, should have completeness-related suggestion + constraint_names = [s["constraint_name"] for s in id_suggestions] + assert any( + "Complete" in name or "NotNull" in name or "Completeness" in name + for name in constraint_names + ) + + def test_categorical_suggestion(self, spark, suggestion_df): + """Test categorical constraints are suggested for low-cardinality columns.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + status_suggestions = [r for r in rows if r["column_name"] == "status"] + + constraint_names = [s["constraint_name"] for s in status_suggestions] + # Should suggest IsIn/Contained for categorical column (3 distinct values) + has_categorical = any( + "IsIn" in name or "Contained" in name or "Categorical" in name + for name in constraint_names + ) + # If no categorical suggestion, at least verify we got some suggestions + assert has_categorical or len(constraint_names) > 0 + + +class TestRulesCombinations: + """Test different rule combinations.""" + + def test_numerical_rules(self, spark, suggestion_df): + """Test NUMERICAL rules generate statistical constraints.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.NUMERICAL) + .run() + ) + + rows = result.collect() + score_suggestions = [r for r in rows if r["column_name"] == "score"] + + # Numerical rules should suggest min/max/mean constraints for numeric column + constraint_names = [s["constraint_name"] for s in score_suggestions] + has_numeric_constraint = any( + name in ["HasMin", "HasMax", "HasMean", "Minimum", "Maximum", "Mean"] + or "Min" in name + or "Max" in name + for name in constraint_names + ) + # Either we have numeric constraints or the rule set is empty + assert has_numeric_constraint or len(rows) == 0 + + def test_extended_rules(self, spark, suggestion_df): + """Test EXTENDED rules include all rule types.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.EXTENDED) + .run() + ) + + extended_count = result.count() + + # Extended should generate suggestions + assert extended_count >= 0 + + def test_multiple_rules_combined(self, spark, suggestion_df): + """Test adding multiple rule sets.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + 
.addConstraintRules(Rules.DEFAULT) + .addConstraintRules(Rules.NUMERICAL) + .run() + ) + + assert result.count() >= 0 + + def test_common_rules_uniqueness(self, spark, suggestion_df): + """Test COMMON rules suggest uniqueness for unique columns.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.COMMON) + .run() + ) + + rows = result.collect() + id_suggestions = [r for r in rows if r["column_name"] == "id"] + + constraint_names = [s["constraint_name"] for s in id_suggestions] + # id column is unique, should potentially get uniqueness suggestion + has_unique = any("Unique" in name for name in constraint_names) + # If no unique suggestion, at least verify we ran without error + assert has_unique or len(rows) >= 0 + + +class TestTrainTestSplit: + """Test train/test split evaluation.""" + + def test_train_test_split_evaluation(self, spark, suggestion_df): + """Test suggestions are evaluated on test set.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .useTrainTestSplitWithTestsetRatio(0.3) + .run() + ) + + rows = result.collect() + # When train/test split is used, evaluation columns should exist + assert "evaluation_status" in result.columns + assert "evaluation_metric_value" in result.columns + + def test_train_test_with_seed(self, spark, suggestion_df): + """Test reproducible train/test split with seed.""" + result1 = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .useTrainTestSplitWithTestsetRatio(0.3, seed=42) + .run() + ) + + result2 = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .useTrainTestSplitWithTestsetRatio(0.3, seed=42) + .run() + ) + + # Same seed should produce same suggestion count + assert result1.count() == result2.count() + + def test_train_test_invalid_ratio(self, spark, suggestion_df): + """Test invalid train/test ratio raises error.""" + with pytest.raises(ValueError, match="between 0.0 and 1.0"): + ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .useTrainTestSplitWithTestsetRatio(1.5) + .run() + ) + + with pytest.raises(ValueError, match="between 0.0 and 1.0"): + ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .useTrainTestSplitWithTestsetRatio(0.0) + .run() + ) + + +class TestSuggestionOptions: + """Test suggestion configuration options.""" + + def test_restrict_to_columns(self, spark, suggestion_df): + """Test restricting suggestions to specific columns.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .restrictToColumns(["id", "status"]) + .run() + ) + + rows = result.collect() + columns_with_suggestions = set(r["column_name"] for r in rows) + + # Should only have suggestions for restricted columns + assert columns_with_suggestions.issubset({"id", "status"}) + + def test_code_for_constraint_format(self, spark, suggestion_df): + """Test code_for_constraint is valid Python-like syntax.""" + result = ( + ConstraintSuggestionRunner(spark) + .onData(suggestion_df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + for row in rows: + code = row["code_for_constraint"] + # Should be non-empty string + assert code is not None + assert len(code) > 0 + # Should not contain Scala-specific syntax (after conversion) + assert "Some(" not in code + 
assert "Seq(" not in code + + def test_no_rules_raises_error(self, spark, suggestion_df): + """Test that running without rules raises an error.""" + with pytest.raises(ValueError, match="At least one constraint rule"): + ConstraintSuggestionRunner(spark).onData(suggestion_df).run() + + +class TestSuggestionEdgeCases: + """Test edge cases for suggestions.""" + + def test_single_row(self, spark): + """Test suggestions on single row DataFrame.""" + df = spark.createDataFrame([Row(id=1, value="test")]) + result = ( + ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + # Should handle gracefully + assert result.count() >= 0 + + def test_high_cardinality_column(self, spark): + """Test suggestions for high cardinality column.""" + df = spark.createDataFrame( + [Row(id=i, unique_value=f"value_{i}") for i in range(100)] + ) + result = ( + ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + unique_suggestions = [r for r in rows if r["column_name"] == "unique_value"] + + # Should NOT suggest IsIn for high cardinality + constraint_names = [s["constraint_name"] for s in unique_suggestions] + assert not any("IsIn" in name for name in constraint_names) + + def test_all_null_column(self, spark): + """Test suggestions for column with all nulls.""" + from pyspark.sql.types import IntegerType, StringType, StructField, StructType + + schema = StructType( + [ + StructField("id", IntegerType(), False), + StructField("value", StringType(), True), + ] + ) + df = spark.createDataFrame( + [(1, None), (2, None), (3, None)], + schema=schema, + ) + result = ( + ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .run() + ) + + rows = result.collect() + # Should handle all-null column gracefully + assert len(rows) >= 0 + + +class TestRulesEnum: + """Unit tests for Rules enum (no Spark needed).""" + + def test_rules_values(self): + """Test Rules enum has expected values.""" + assert Rules.DEFAULT.value == "DEFAULT" + assert Rules.STRING.value == "STRING" + assert Rules.NUMERICAL.value == "NUMERICAL" + assert Rules.COMMON.value == "COMMON" + assert Rules.EXTENDED.value == "EXTENDED" + + def test_all_rules_defined(self): + """Test all expected rules are defined.""" + expected_rules = {"DEFAULT", "STRING", "NUMERICAL", "COMMON", "EXTENDED"} + actual_rules = {r.value for r in Rules} + assert actual_rules == expected_rules diff --git a/tests/v2/test_unit.py b/tests/v2/test_unit.py new file mode 100644 index 0000000..766e6ef --- /dev/null +++ b/tests/v2/test_unit.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +""" +Unit tests for PyDeequ V2 Spark Connect module. + +These tests verify the Python client API works correctly without +requiring a Spark session. They test protobuf serialization of +predicates, checks, and analyzers. +""" + +import unittest + +from pydeequ.v2 import ( + # Checks + Check, + CheckLevel, + Completeness, + Mean, + # Analyzers + Size, + between, + # Predicates + eq, + gte, + is_one, +) +class TestPredicates(unittest.TestCase): + """Test predicate serialization. + + These tests use hardcoded numeric values for operator enums to detect + proto sync issues between deequ (source of truth) and python-deequ. 
+ + Expected values from deequ_connect.proto: + UNSPECIFIED = 0, EQ = 1, NE = 2, GT = 3, GE = 4, LT = 5, LE = 6, BETWEEN = 7 + """ + + def test_eq_predicate(self): + p = eq(100) + proto = p.to_proto() + self.assertEqual(proto.operator, 1) # EQ + self.assertEqual(proto.value, 100.0) + + def test_gte_predicate(self): + p = gte(0.95) + proto = p.to_proto() + self.assertEqual(proto.operator, 4) # GE + self.assertEqual(proto.value, 0.95) + + def test_between_predicate(self): + p = between(10, 20) + proto = p.to_proto() + self.assertEqual(proto.operator, 7) # BETWEEN + self.assertEqual(proto.lower_bound, 10.0) + self.assertEqual(proto.upper_bound, 20.0) + + def test_is_one_predicate(self): + p = is_one() + proto = p.to_proto() + self.assertEqual(proto.operator, 1) # EQ + self.assertEqual(proto.value, 1.0) + + +class TestCheckBuilder(unittest.TestCase): + """Test Check class protobuf building.""" + + def test_check_with_constraints(self): + check = ( + Check(CheckLevel.Error, "Test check") + .isComplete("id") + .hasCompleteness("email", gte(0.95)) + .hasSize(eq(100)) + ) + + proto = check.to_proto() + + self.assertEqual(proto.level, 0) # ERROR + self.assertEqual(proto.description, "Test check") + self.assertEqual(len(proto.constraints), 3) + + # Check constraint types + self.assertEqual(proto.constraints[0].type, "isComplete") + self.assertEqual(proto.constraints[0].column, "id") + + self.assertEqual(proto.constraints[1].type, "hasCompleteness") + self.assertEqual(proto.constraints[1].column, "email") + + self.assertEqual(proto.constraints[2].type, "hasSize") + + def test_check_warning_level(self): + check = Check(CheckLevel.Warning, "Warning check") + proto = check.to_proto() + self.assertEqual(proto.level, 1) # WARNING + + +class TestAnalyzerBuilder(unittest.TestCase): + """Test Analyzer classes protobuf building.""" + + def test_size_analyzer(self): + analyzer = Size() + proto = analyzer.to_proto() + self.assertEqual(proto.type, "Size") + + def test_completeness_analyzer(self): + analyzer = Completeness("email") + proto = analyzer.to_proto() + self.assertEqual(proto.type, "Completeness") + self.assertEqual(proto.column, "email") + + def test_mean_analyzer(self): + analyzer = Mean("amount") + proto = analyzer.to_proto() + self.assertEqual(proto.type, "Mean") + self.assertEqual(proto.column, "amount") + + def test_analyzer_with_where(self): + analyzer = Size(where="status = 'active'") + proto = analyzer.to_proto() + self.assertEqual(proto.type, "Size") + self.assertEqual(proto.where, "status = 'active'") + + +if __name__ == "__main__": + unittest.main() diff --git a/tutorials/data_quality_example_v2.py b/tutorials/data_quality_example_v2.py new file mode 100644 index 0000000..2f343c6 --- /dev/null +++ b/tutorials/data_quality_example_v2.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 +""" +Testing Data Quality at Scale with PyDeequ V2 + +This example demonstrates the PyDeequ 2.0 API using Spark Connect. +It covers data analysis, constraint verification, column profiling, +and constraint suggestions. + +Prerequisites: +1. Start the Spark Connect server with the Deequ plugin: + + $SPARK_HOME/sbin/start-connect-server.sh \ + --packages org.apache.spark:spark-connect_2.12:3.5.0 \ + --jars /path/to/deequ_2.12-2.1.0b-spark-3.5.jar \ + --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin + +2. 
Run this script: + SPARK_REMOTE=sc://localhost:15002 python data_quality_example_v2.py +""" + +import os +from pyspark.sql import SparkSession, Row + +# PyDeequ V2 imports +from pydeequ.v2.analyzers import ( + Size, + Completeness, + Distinctness, + Mean, + Minimum, + Maximum, + StandardDeviation, + Correlation, +) +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.verification import AnalysisRunner, VerificationSuite +from pydeequ.v2.predicates import eq, gte, lte, between +from pydeequ.v2.profiles import ColumnProfilerRunner +from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + + +def create_sample_data(spark: SparkSession): + """Create a sample product reviews dataset for demonstration.""" + data = [ + # Normal reviews + Row(review_id="R001", customer_id="C100", product_id="P001", + marketplace="US", star_rating=5, helpful_votes=10, total_votes=12, + review_year=2023, product_title="Great Product", insight="Y"), + Row(review_id="R002", customer_id="C101", product_id="P002", + marketplace="US", star_rating=4, helpful_votes=8, total_votes=10, + review_year=2023, product_title="Good Value", insight="Y"), + Row(review_id="R003", customer_id="C102", product_id="P001", + marketplace="UK", star_rating=5, helpful_votes=15, total_votes=18, + review_year=2022, product_title="Great Product", insight="N"), + Row(review_id="R004", customer_id="C103", product_id="P003", + marketplace="DE", star_rating=3, helpful_votes=5, total_votes=8, + review_year=2022, product_title="Decent Item", insight="Y"), + Row(review_id="R005", customer_id="C104", product_id="P002", + marketplace="FR", star_rating=4, helpful_votes=12, total_votes=15, + review_year=2021, product_title="Good Value", insight="N"), + Row(review_id="R006", customer_id="C105", product_id="P004", + marketplace="JP", star_rating=5, helpful_votes=20, total_votes=22, + review_year=2023, product_title="Excellent!", insight="Y"), + Row(review_id="R007", customer_id="C106", product_id="P001", + marketplace="US", star_rating=2, helpful_votes=3, total_votes=10, + review_year=2020, product_title="Great Product", insight="N"), + Row(review_id="R008", customer_id="C107", product_id="P005", + marketplace="UK", star_rating=1, helpful_votes=25, total_votes=30, + review_year=2021, product_title="Disappointing", insight="Y"), + # Review with missing marketplace (data quality issue) + Row(review_id="R009", customer_id="C108", product_id="P002", + marketplace=None, star_rating=4, helpful_votes=7, total_votes=9, + review_year=2023, product_title="Good Value", insight="Y"), + # Duplicate review_id (data quality issue) + Row(review_id="R001", customer_id="C109", product_id="P003", + marketplace="US", star_rating=3, helpful_votes=4, total_votes=6, + review_year=2022, product_title="Decent Item", insight="N"), + ] + return spark.createDataFrame(data) + + +def run_data_analysis(spark: SparkSession, df): + """ + Run data analysis using AnalysisRunner. 
+ + This demonstrates computing various metrics on the dataset: + - Size: Total row count + - Completeness: Ratio of non-null values + - Distinctness: Ratio of distinct values + - Mean, Min, Max: Statistical measures + - Correlation: Relationship between columns + """ + print("\n" + "=" * 60) + print("DATA ANALYSIS") + print("=" * 60) + + result = (AnalysisRunner(spark) + .onData(df) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("review_id")) + .addAnalyzer(Completeness("marketplace")) + .addAnalyzer(Distinctness("review_id")) + .addAnalyzer(Mean("star_rating")) + .addAnalyzer(Minimum("star_rating")) + .addAnalyzer(Maximum("star_rating")) + .addAnalyzer(StandardDeviation("star_rating")) + .addAnalyzer(Correlation("total_votes", "helpful_votes")) + .run()) + + print("\nAnalysis Results:") + result.show(truncate=False) + + # Extract key insights + rows = result.collect() + metrics = {(r["name"], r["instance"]): r["value"] for r in rows} + + print("\nKey Insights:") + print(f" - Dataset contains {int(metrics.get(('Size', '*'), 0))} reviews") + print(f" - review_id completeness: {metrics.get(('Completeness', 'review_id'), 0):.1%}") + print(f" - marketplace completeness: {metrics.get(('Completeness', 'marketplace'), 0):.1%}") + print(f" - review_id distinctness: {metrics.get(('Distinctness', 'review_id'), 0):.1%}") + print(f" - Average star rating: {metrics.get(('Mean', 'star_rating'), 0):.2f}") + print(f" - Star rating range: {metrics.get(('Minimum', 'star_rating'), 0):.0f} - {metrics.get(('Maximum', 'star_rating'), 0):.0f}") + + return result + + +def run_constraint_verification(spark: SparkSession, df): + """ + Run constraint verification using VerificationSuite. + + This demonstrates defining and verifying data quality rules: + - Size checks + - Completeness checks + - Uniqueness checks + - Range checks (min/max) + - Categorical value checks + """ + print("\n" + "=" * 60) + print("CONSTRAINT VERIFICATION") + print("=" * 60) + + # Define checks using the V2 predicate API + # Note: In V2, we use predicates like eq(), gte(), between() instead of lambdas + check = (Check(CheckLevel.Warning, "Product Reviews Quality Check") + # Size check: at least 5 reviews + .hasSize(gte(5)) + # Completeness checks + .isComplete("review_id") + .isComplete("customer_id") + .hasCompleteness("marketplace", gte(0.8)) # Allow some missing + # Uniqueness check + .isUnique("review_id") + # Star rating range check + .hasMin("star_rating", eq(1.0)) + .hasMax("star_rating", eq(5.0)) + .hasMean("star_rating", between(1.0, 5.0)) + # Year range check + .hasMin("review_year", gte(2015)) + .hasMax("review_year", lte(2025)) + # Categorical check + .isContainedIn("marketplace", ["US", "UK", "DE", "JP", "FR"]) + .isContainedIn("insight", ["Y", "N"]) + ) + + result = (VerificationSuite(spark) + .onData(df) + .addCheck(check) + .run()) + + print("\nVerification Results:") + result.show(truncate=False) + + # Summarize results + rows = result.collect() + passed = sum(1 for r in rows if r["constraint_status"] == "Success") + failed = sum(1 for r in rows if r["constraint_status"] == "Failure") + + print(f"\nSummary: {passed} passed, {failed} failed out of {len(rows)} constraints") + + if failed > 0: + print("\nFailed Constraints:") + for r in rows: + if r["constraint_status"] == "Failure": + print(f" - {r['constraint']}") + if r["constraint_message"]: + print(f" Message: {r['constraint_message']}") + + return result + + +def run_column_profiling(spark: SparkSession, df): + """ + Run column profiling using ColumnProfilerRunner. 
+ + This automatically computes statistics for each column: + - Completeness + - Approximate distinct values + - Data type detection + - Numeric statistics (mean, min, max, etc.) + - Optional: KLL sketches for approximate quantiles + """ + print("\n" + "=" * 60) + print("COLUMN PROFILING") + print("=" * 60) + + result = (ColumnProfilerRunner(spark) + .onData(df) + .withLowCardinalityHistogramThreshold(10) # Generate histograms for low-cardinality columns + .run()) + + print("\nColumn Profiles:") + # Show selected columns for readability + result.select( + "column", "completeness", "approx_distinct_values", + "data_type", "mean", "minimum", "maximum" + ).show(truncate=False) + + return result + + +def run_constraint_suggestions(spark: SparkSession, df): + """ + Run automated constraint suggestion using ConstraintSuggestionRunner. + + This analyzes the data and suggests appropriate constraints: + - Completeness constraints for complete columns + - Uniqueness constraints for unique columns + - Categorical range constraints for low-cardinality columns + - Non-negative constraints for numeric columns + """ + print("\n" + "=" * 60) + print("CONSTRAINT SUGGESTIONS") + print("=" * 60) + + result = (ConstraintSuggestionRunner(spark) + .onData(df) + .addConstraintRules(Rules.DEFAULT) + .run()) + + print("\nSuggested Constraints:") + result.select( + "column_name", "constraint_name", "description", "code_for_constraint" + ).show(truncate=False) + + # Count suggestions per column + rows = result.collect() + print(f"\nTotal suggestions: {len(rows)}") + + return result + + +def main(): + # Get Spark Connect URL from environment + spark_remote = os.environ.get("SPARK_REMOTE", "sc://localhost:15002") + + print("PyDeequ V2 Data Quality Example") + print(f"Connecting to: {spark_remote}") + + # Create Spark Connect session + spark = SparkSession.builder.remote(spark_remote).getOrCreate() + + try: + # Create sample data + print("\nCreating sample product reviews dataset...") + df = create_sample_data(spark) + + print("\nDataset Schema:") + df.printSchema() + + print("\nSample Data:") + df.show(truncate=False) + + # Run all examples + run_data_analysis(spark, df) + run_constraint_verification(spark, df) + run_column_profiling(spark, df) + run_constraint_suggestions(spark, df) + + print("\n" + "=" * 60) + print("EXAMPLE COMPLETE") + print("=" * 60) + + finally: + spark.stop() + + +if __name__ == "__main__": + main()