Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion app/schemas/players/transfers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import date
from typing import Optional
from typing import Optional, Literal

from app.schemas.base import AuditMixin, TransfermarktBaseModel

Expand All @@ -18,6 +18,7 @@ class PlayerTransfer(TransfermarktBaseModel):
season: str
market_value: Optional[int]
fee: Optional[int]
transfer_type: Literal["permanent", "loan", "end_of_loan", "free_transfer"]


class PlayerTransfers(TransfermarktBaseModel, AuditMixin):
Expand Down
65 changes: 64 additions & 1 deletion app/services/players/transfers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from dataclasses import dataclass
from bs4 import BeautifulSoup
import re
from typing import Tuple

from app.services.base import TransfermarktBase
from app.utils.utils import extract_from_url, safe_split
Expand Down Expand Up @@ -26,6 +29,50 @@ def __post_init__(self) -> None:
self.raise_exception_if_not_found(xpath=Players.Profile.NAME)
self.transfer_history = self.make_request(url=self.URL_TRANSFERS.format(player_id=self.player_id))

def __clean_html_value(self, value: str) -> Tuple[str, str]:
"""
Clean HTML tags from a string value and extract just the currency value or preserve special text values.
Also determine the transfer type based on the fee text.

Args:
value (str): The string value that may contain HTML tags.

Returns:
Tuple[str, str]: A tuple containing (cleaned_fee_value, transfer_type)
"""
if not value or not isinstance(value, str):
return value, "permanent"

soup = BeautifulSoup(value, 'html.parser')
text = soup.get_text().strip()

if "end of loan" in text.lower():
return "End of loan", "end_of_loan"

if "loan transfer" in text.lower() and not any(char.isdigit() for char in text):
return "loan transfer", "loan"

if "fee" in text.lower():
currency_match = re.search(r'(€\d+(?:\.\d+)?[km]?)', text)
if currency_match:
if "loan" in text.lower():
return currency_match.group(1), "loan"
return currency_match.group(1), "permanent"

number_match = re.search(r'(\d+(?:\.\d+)?[km]?)', text)
if number_match:
if "loan" in text.lower():
return number_match.group(1), "loan"
return number_match.group(1), "permanent"

if "loan" in text.lower() and not any(char.isdigit() for char in text):
return "€0", "loan"

if "free transfer" in text.lower() or text == "-":
return "€0", "free_transfer"

return text, "permanent"

Comment on lines +32 to +75
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Verify regex pattern for currency extraction

The regex pattern used to extract currency values might not catch all formats. Consider expanding it to handle additional currency formats or edge cases.


🏁 Script executed:

#!/bin/bash
# Check for any transfer fee values in the codebase that might not match the current regex patterns

# Pattern 1: €\d+(?:\.\d+)?[km]?
# Pattern 2: \d+(?:\.\d+)?[km]?

# Check for any test data or mock responses containing fee values
rg -A1 -B1 -i "fee.*[€\$£]" --glob "*.json" --glob "*.py"

Length of output: 58


Below is a new verification step to more directly test the regex patterns against a few sample currency strings. This should help clarify whether the current patterns work as intended or need expansion for additional edge cases:


Regex Currency Extraction Verification

It appears the current regex implementation extracts standard euro formats (e.g., "€100", "€100.5", "€100k") effectively. However, note that:

  • Edge Cases: Formats with commas (e.g., "€100,00" or "€1,000") won’t match the current pattern.
  • Currency Flexibility: Only euro (i.e. "€") values are directly handled; if other currencies such as "$" or "£" might appear, additional patterns will be needed.
  • Fallback Behavior: The secondary numeric pattern (\d+(?:\.\d+)?[km]?) will catch values missing the currency symbol but might not address formatting with separators.

If the application context might include such variants, consider expanding or adding alternative regex patterns to handle these additional formats.

Please verify these behaviors against your expected input cases to ensure robustness.


🏁 Script executed:

#!/bin/bash
python3 - << 'EOF'
import re

# Define the regex patterns being used in the codebase
pattern_euro = re.compile(r'(€\d+(?:\.\d+)?[km]?)')
pattern_numeric = re.compile(r'(\d+(?:\.\d+)?[km]?)')

# Sample test inputs covering common and edge cases
test_values = [
    "€100",        # Standard euro value
    "€100.5",      # Euro value with decimal
    "€100k",       # Euro value with suffix k
    "€100.5m",     # Euro value with suffix m
    "$100",        # Dollar value (shouldn't match pattern_euro)
    "£100",        # Pound value (shouldn't match pattern_euro)
    "100",         # Numeric without currency, for fallback pattern
    "100.5",       # Numeric with decimal
    "100k",        # Numeric with suffix
    "€100,00",     # Euro value with a comma as decimal separator (edge case)
    "€1,000",      # Euro value with thousand separator (edge case)
    "loan transfer fee of €100"  # Context containing extra text
]

print("Pattern: (€\\d+(?:\\.\\d+)?[km]?)")
for s in test_values:
    m = pattern_euro.search(s)
    print(f"Input: {s!r} -> Extracted: {m.group(1) if m else None}")

print("\nPattern: (\\d+(?:\\.\\d+)?[km]?)")
for s in test_values:
    m = pattern_numeric.search(s)
    print(f"Input: {s!r} -> Extracted: {m.group(1) if m else None}")
EOF

Length of output: 1050


Currency Regex Enhancement Required

The current regex pattern (€\d+(?:\.\d+)?[km]?) reliably handles standard euro formats (e.g., "€100", "€100.5", "€100k"), but our tests show it fails for values with commas (e.g., "€100,00" and "€1,000"). This limits correct extraction of amounts using comma delimiters. Consider updating the regex pattern—for example, replacing (?:\.\d+)? with something like (?:[,.]\d+)?—to account for both period and comma as decimal separators. Also, if inputs may include other currencies (e.g., "$" or "£"), further adjustments may be necessary.

  • Location: app/services/players/transfers.py (Lines 32-75)
  • Action: Update the regex pattern to robustly capture currency values, including comma-separated decimals/thousands.
🧰 Tools
🪛 Ruff (0.8.2)

36-36: Blank line contains whitespace

Remove whitespace from blank line

(W293)


39-39: Blank line contains whitespace

Remove whitespace from blank line

(W293)


45-45: Blank line contains whitespace

Remove whitespace from blank line

(W293)


46-46: Single quotes found but double quotes preferred

Replace single quotes with double quotes

(Q000)


48-48: Blank line contains whitespace

Remove whitespace from blank line

(W293)


51-51: Blank line contains whitespace

Remove whitespace from blank line

(W293)


54-54: Blank line contains whitespace

Remove whitespace from blank line

(W293)


56-56: Single quotes found but double quotes preferred

Replace single quotes with double quotes

(Q000)


61-61: Blank line contains whitespace

Remove whitespace from blank line

(W293)


62-62: Single quotes found but double quotes preferred

Replace single quotes with double quotes

(Q000)


67-67: Blank line contains whitespace

Remove whitespace from blank line

(W293)


70-70: Blank line contains whitespace

Remove whitespace from blank line

(W293)


73-73: Blank line contains whitespace

Remove whitespace from blank line

(W293)

def __parse_player_transfer_history(self) -> list:
"""
Parse and retrieve the transfer history of the specified player from Transfermarkt,
Expand Down Expand Up @@ -53,10 +100,26 @@ def __parse_player_transfer_history(self) -> list:
"upcoming": transfer["upcoming"],
"season": transfer["season"],
"marketValue": transfer["marketValue"],
"fee": transfer["fee"],
**self.__process_fee_and_type(transfer["fee"]),
}
for transfer in transfers
]

def __process_fee_and_type(self, fee_value: str) -> dict:
"""
Process the fee value and determine the transfer type.

Args:
fee_value (str): The raw fee value from the transfer data.

Returns:
dict: A dictionary containing the cleaned fee value and transfer type.
"""
fee, transfer_type = self.__clean_html_value(fee_value)
return {
"fee": fee,
"transferType": transfer_type
}

def get_player_transfers(self) -> dict:
"""
Expand Down
6 changes: 5 additions & 1 deletion tests/players/test_players_transfers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest
from fastapi import HTTPException
from schema import And, Optional, Schema
from schema import And, Optional, Schema, Or

from app.services.players.transfers import TransfermarktPlayerTransfers

Expand Down Expand Up @@ -36,6 +36,7 @@ def test_get_player_transfers(player_id, len_greater_than_0, regex_integer, rege
"upcoming": bool,
Optional("marketValue"): And(str, len_greater_than_0, regex_market_value),
Optional("fee"): And(str, len_greater_than_0),
"transferType": Or("permanent", "loan", "end_of_loan", "free_transfer"),
},
],
"youthClubs": list,
Expand All @@ -46,3 +47,6 @@ def test_get_player_transfers(player_id, len_greater_than_0, regex_integer, rege
assert expected_schema.validate(result)
assert any("marketValue" in stat for stat in result.get("transfers"))
assert any("fee" in stat for stat in result.get("transfers"))

transfer_types = [transfer.get("transferType") for transfer in result.get("transfers")]
assert all(transfer_types), "All transfers should have a transferType"