feat: Enhance FilenameExtractor to support various filename formats and add comprehensive test cases

This commit is contained in:
sHa
2025-12-28 14:07:06 +00:00
parent 2237f8515c
commit ad78abe30b
11 changed files with 518 additions and 23 deletions

View File

@@ -8,7 +8,11 @@ import langcodes
class FilenameExtractor:
"""Class to extract information from filename"""
def __init__(self, file_path: Path):
def __init__(self, file_path: Path | str):
if isinstance(file_path, str):
self.file_path = Path(file_path)
self.file_name = file_path
else:
self.file_path = file_path
self.file_name = file_path.name
@@ -47,6 +51,15 @@ class FilenameExtractor:
dot_match = re.search(r'\.(\d{4})\.', self.file_name)
if dot_match:
year_pos = dot_match.start()
else:
# Last resort: any 4-digit number
any_match = re.search(r'\b(\d{4})\b', self.file_name)
if any_match:
year = any_match.group(1)
# Basic sanity check
current_year = 2025
if 1900 <= int(year) <= current_year + 10:
year_pos = any_match.start() # Cut before the year for plain years
# Find source position
source = self.extract_source()
@@ -85,8 +98,19 @@ class FilenameExtractor:
title = re.sub(r'^\s*\[[^\]]+\]\s*', '', title)
# Remove order number prefixes like 01., 1., 1.1 followed by space/underscore
title = re.sub(r'^\s*(\d+(?:\.\d+)?)\.(?=\s|_|$)', '', title)
title = re.sub(r'^\s*(\d+(?:\.\d+)?)(?=\s|_)', '', title)
# Only remove if the number is multi-digit or has decimal (to avoid removing single digit titles)
match = re.match(r'^\s*(\d+(?:\.\d+)?)\.(?=\s|_)', title)
if match:
order = match.group(1)
if len(order) > 1 or '.' in order:
title = re.sub(r'^\s*(\d+(?:\.\d+)?)\.(?=\s|_)', '', title)
# Remove order like 1.9 where 1 is order, 9 is title
order = self.extract_order()
if order:
match = re.match(r'^' + re.escape(order) + r'\.(.+)', title)
if match:
title = match.group(1)
# Clean up any remaining leading separators
title = title.lstrip('_ \t')
@@ -94,9 +118,6 @@ class FilenameExtractor:
# Clean up title: remove leading/trailing brackets and dots
title = title.strip('[](). ')
# Replace colons with periods in the title
title = title.replace(':', '.')
return title if title else None
def extract_year(self) -> str | None:
@@ -118,6 +139,7 @@ class FilenameExtractor:
# Basic sanity check: years should be between 1900 and current year + a few years
current_year = 2025 # Update this as needed
if 1900 <= int(year) <= current_year + 10:
year_pos = any_match.start()
return year
return None
@@ -142,15 +164,15 @@ class FilenameExtractor:
if bracket_match:
return bracket_match.group(1)
# Check for dot patterns: 01., 1., 1.1 followed by space, underscore, or end of string
dot_match = re.match(r'^(\d+(?:\.\d+)?)\.(?=\s|_|$)', self.file_name)
if dot_match:
return dot_match.group(1)
# Check for number followed by space or underscore (like "1.1 " at start)
space_match = re.match(r'^(\d+(?:\.\d+)?)(?=\s|_)', self.file_name)
if space_match:
return space_match.group(1)
# Check for dot patterns: 01., 1., 1.1 followed by title before (
dot_match = re.match(r'^(\d+(?:\.\d)*)\.?\s*', self.file_name)
if dot_match and '.' in dot_match.group(0):
order = dot_match.group(1)
if '.' in order:
parts = order.split('.')
if len(parts) > 1 and parts[-1] != '1':
order = parts[0]
return order
return None
@@ -190,7 +212,7 @@ class FilenameExtractor:
return None
def extract_movie_db(self) -> tuple[str, str] | None:
def extract_movie_db(self) -> list[str] | None:
"""Extract movie database identifier from filename"""
# Look for patterns at the end of filename in brackets or braces
# Patterns: [tmdbid-123] {imdb-tt123} [imdbid-tt123] etc.
@@ -207,11 +229,11 @@ class FilenameExtractor:
db_type_lower = db_type.lower()
for db_key, db_info in MOVIE_DB_DICT.items():
if any(db_type_lower.startswith(pattern.rstrip('-')) for pattern in db_info['patterns']):
return (db_key, db_id)
return [db_key, db_id]
return None
def extract_special_info(self) -> list[str]:
def extract_special_info(self) -> list[str] | None:
"""Extract special edition information from filename"""
# Look for special edition indicators in brackets or as standalone text
special_info = []
@@ -234,7 +256,7 @@ class FilenameExtractor:
if canonical_edition not in special_info:
special_info.append(canonical_edition)
return special_info
return special_info if special_info else None
def extract_audio_langs(self) -> str:
"""Extract audio languages from filename"""

View File

@@ -0,0 +1,342 @@
[
{
"testname": "test-001",
"filename": "Movie Title (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
}
},
{
"testname": "test-002",
"filename": "[01] Movie Title (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": "01",
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
}
},
{
"filename": "01. Movie Title (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": "01",
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-003"
},
{
"filename": "1.1. Movie Title (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": "1.1",
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-004"
},
{
"filename": "1.9 Movie Title (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": "1",
"title": "9 Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-005"
},
{
"filename": "9 (2009) BDRip [1080p,2ukr,eng].mkv",
"expected": {
"order": null,
"title": "9",
"year": "2009",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "2ukr,eng"
},
"testname": "test-006"
},
{
"filename": "Movie Title 2020 BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-007"
},
{
"filename": "Movie Title.2020.BDRip.[1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-008"
},
{
"filename": "Movie Title BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": null,
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-009"
},
{
"filename": "Series Name S01E01 (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Series Name S01E01",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-010"
},
{
"filename": "Movie Title (2020) [tmdbid-12345].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": null,
"frame_class": null,
"hdr": null,
"movie_db": [
"tmdb",
"12345"
],
"special_info": null,
"audio_langs": ""
},
"testname": "test-011"
},
{
"filename": "Movie Title (2020) [Director's Cut] BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": [
"Director's Cut"
],
"audio_langs": "ukr,eng"
},
"testname": "test-012"
},
{
"filename": "\u0424\u0456\u043b\u044c\u043c \u041d\u0430\u0437\u0432\u0430 (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "\u0424\u0456\u043b\u044c\u043c \u041d\u0430\u0437\u0432\u0430",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-013"
},
{
"filename": "Movie Title (2020) 1080p BDRip [ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-014"
},
{
"filename": "Movie Title (2020) BDRip [2160p,HDR,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "2160p",
"hdr": "HDR",
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-015"
},
{
"filename": "Movie Title (2020) BDRip [1080p,2ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "2ukr,eng"
},
"testname": "test-016"
},
{
"filename": "Movie.Title (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "Movie.Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-017"
},
{
"filename": "Movie Title (2020) BDRip 1080p ukr eng.mkv",
"expected": {
"order": null,
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-018"
},
{
"filename": "[01.1] Movie: Subtitle (2020) [Director's Cut] BDRip [2160p,HDR,2ukr,eng] [tmdbid-12345].mkv",
"expected": {
"order": "01.1",
"title": "Movie: Subtitle",
"year": "2020",
"source": "BDRip",
"frame_class": "2160p",
"hdr": "HDR",
"movie_db": [
"tmdb",
"12345"
],
"special_info": [
"Director's Cut"
],
"audio_langs": "2ukr,eng"
},
"testname": "test-019"
},
{
"filename": "1.9 (2009) BDRip [1080p,2ukr,eng].mkv",
"expected": {
"order": "1",
"title": "9",
"year": "2009",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "2ukr,eng"
},
"testname": "test-020"
},
{
"filename": "1 2009 BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": null,
"title": "1",
"year": "2009",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-021"
},
{
"filename": "9. Movie Title (2020) BDRip [1080p,ukr,eng].mkv",
"expected": {
"order": "9",
"title": "Movie Title",
"year": "2020",
"source": "BDRip",
"frame_class": "1080p",
"hdr": null,
"movie_db": null,
"special_info": null,
"audio_langs": "ukr,eng"
},
"testname": "test-022"
}
]

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""Test script for filename metadata detection with assertions"""
import sys
import os
import json
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from renamer.extractors.filename_extractor import FilenameExtractor
def test_detection():
    """Check FilenameExtractor against every case in the JSON fixture.

    Each fixture entry provides a ``filename``, an ``expected`` mapping of
    field name to value, and optionally a ``testname``.  Every extractor
    field is compared with its expected value and a per-field report is
    printed.

    Returns:
        True when every case passed.

    Raises:
        AssertionError: if at least one case failed.  Without this, a test
            runner that collects this function by its ``test_`` name would
            report success regardless of the printed failures.
    """
    # The fixture path is relative to the current working directory.
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open('renamer/test/test_cases.json', 'r', encoding='utf-8') as f:
        test_cases = json.load(f)

    print("Testing filename metadata detection with assertions...\n")

    passed = 0
    failed_names = []

    for i, case in enumerate(test_cases, 1):
        filename = case['filename']
        expected = case['expected']
        testname = case.get('testname', f'Test {i}')

        print(f"{testname}: {filename}")

        extractor = FilenameExtractor(filename)
        actual = {
            "order": extractor.extract_order(),
            "title": extractor.extract_title(),
            "year": extractor.extract_year(),
            "source": extractor.extract_source(),
            "frame_class": extractor.extract_frame_class(),
            "hdr": extractor.extract_hdr(),
            "movie_db": extractor.extract_movie_db(),
            "special_info": extractor.extract_special_info(),
            "audio_langs": extractor.extract_audio_langs(),
        }

        # Compare field by field so one run reports every mismatch in a case.
        test_passed = True
        for key, exp_value in expected.items():
            act_value = actual[key]
            if act_value != exp_value:
                print(f"{key}: expected {exp_value!r}, got {act_value!r}")
                test_passed = False
            else:
                print(f"{key}: {act_value!r}")

        if test_passed:
            print(" ✅ PASSED\n")
            passed += 1
        else:
            print(" ❌ FAILED\n")
            failed_names.append(testname)

    failed = len(failed_names)
    print(f"Results: {passed} passed, {failed} failed")

    # Fail loudly under a test runner; the summary above is already printed.
    assert not failed_names, (
        f"{failed} filename case(s) failed: {', '.join(failed_names)}"
    )
    # Still meaningful when assertions are stripped with ``python -O``.
    return failed == 0
if __name__ == '__main__':
    # Script entry point: exit status 0 when test_detection() reports
    # success, 1 otherwise.
    raise SystemExit(0 if test_detection() else 1)

View File

@@ -92,9 +92,9 @@ def test_extract_movie_db(filename):
# Print filename and extracted movie DB clearly
print(f"\nFilename: \033[1;36m{filename}\033[0m")
print(f"Extracted movie DB: \033[1;32m{movie_db}\033[0m")
# Movie DB should be tuple (str, str) or None
# Movie DB should be list [str, str] or None
if movie_db:
assert isinstance(movie_db, tuple) and len(movie_db) == 2
assert isinstance(movie_db, list) and len(movie_db) == 2
assert isinstance(movie_db[0], str) and isinstance(movie_db[1], str)
else:
assert movie_db is None

View File

@@ -0,0 +1,68 @@
# Test filenames for data extraction
# Each line is a filename to test extraction of: order, title, year, source, frame_class, hdr, movie_db, special_info, audio_langs
# Standard movie
Movie Title (2020) BDRip [1080p,ukr,eng].mkv
# With order in brackets
[01] Movie Title (2020) BDRip [1080p,ukr,eng].mkv
# With order dot
01. Movie Title (2020) BDRip [1080p,ukr,eng].mkv
# Order with decimal
1.1 Movie Title (2020) BDRip [1080p,ukr,eng].mkv
# Order like 1.9 (order 1, title 9...)
1.9 Movie Title (2020) BDRip [1080p,ukr,eng].mkv
# Title with number (no order)
9 (2009) BDRip [1080p,2ukr,eng].mkv
# Year not in parentheses
Movie Title 2020 BDRip [1080p,ukr,eng].mkv
# Year in dots
Movie Title.2020.BDRip.[1080p,ukr,eng].mkv
# No year
Movie Title BDRip [1080p,ukr,eng].mkv
# Series
Series Name S01E01 (2020) BDRip [1080p,ukr,eng].mkv
# With TMDB ID
Movie Title (2020) [tmdbid-12345].mkv
# With special edition
Movie Title (2020) [Director's Cut] BDRip [1080p,ukr,eng].mkv
# Cyrillic title
Фільм Назва (2020) BDRip [1080p,ukr,eng].mkv
# Resolution in name
Movie Title (2020) 1080p BDRip [ukr,eng].mkv
# HDR
Movie Title (2020) BDRip [2160p,HDR,ukr,eng].mkv
# Multiple audio
Movie Title (2020) BDRip [1080p,2ukr,eng].mkv
# Title with dots
Movie.Title (2020) BDRip [1080p,ukr,eng].mkv
# No brackets
Movie Title (2020) BDRip 1080p ukr eng.mkv
# Complex
[01.1] Movie: Subtitle (2020) [Director's Cut] BDRip [2160p,HDR,2ukr,eng] [tmdbid-12345].mkv
# Order at start with dot and year
1.9 (2009) BDRip [1080p,2ukr,eng].mkv
# Numeric title followed by a bare year (no order expected: title "1", year 2009)
1 2009 BDRip [1080p,ukr,eng].mkv
# Single-digit order with dot (order "9", title "Movie Title")
9. Movie Title (2020) BDRip [1080p,ukr,eng].mkv