feat: Improve title extraction logic and add comprehensive test cases for edge cases
This commit is contained in:
@@ -19,15 +19,65 @@ class FilenameExtractor:
|
||||
|
||||
def extract_title(self) -> str | None:
|
||||
"""Extract movie title from filename"""
|
||||
temp_name = re.sub(r'\s*\(\d{4}\)\s*|\s*\d{4}\s*|\.\d{4}\.', '', self.file_name)
|
||||
|
||||
# Find and remove source
|
||||
# Find positions of year, source, and quality brackets
|
||||
year_pos = -1
|
||||
source_pos = -1
|
||||
quality_pos = -1
|
||||
paren_match = None
|
||||
dot_match = None
|
||||
|
||||
# Find year position (either (YYYY) or .YYYY.)
|
||||
paren_match = re.search(r'\((\d{4})\)', self.file_name)
|
||||
if paren_match:
|
||||
year_pos = paren_match.start()
|
||||
else:
|
||||
dot_match = re.search(r'\.(\d{4})\.', self.file_name)
|
||||
if dot_match:
|
||||
year_pos = dot_match.start()
|
||||
|
||||
# Find source position
|
||||
source = self.extract_source()
|
||||
if source:
|
||||
for alias in SOURCE_DICT[source]:
|
||||
temp_name = re.sub(r'\b' + re.escape(alias) + r'\b', '', temp_name, flags=re.IGNORECASE)
|
||||
|
||||
return temp_name.rsplit('.', 1)[0].strip()
|
||||
match = re.search(r'\b' + re.escape(alias) + r'\b', self.file_name, re.IGNORECASE)
|
||||
if match:
|
||||
source_pos = match.start()
|
||||
break
|
||||
|
||||
# Find quality bracket position (like [720p,ukr,eng])
|
||||
quality_match = re.search(r'\[[^\]]*(?:720p|1080p|2160p|480p|SD|HD|HDR)[^\]]*\]', self.file_name)
|
||||
if quality_match:
|
||||
quality_pos = quality_match.start()
|
||||
|
||||
# Find the earliest position that's not at the beginning
|
||||
positions = [pos for pos in [year_pos, source_pos, quality_pos] if pos > 0]
|
||||
cut_pos = min(positions) if positions else -1
|
||||
|
||||
# Extract title (everything before the cut position)
|
||||
if cut_pos > 0:
|
||||
title = self.file_name[:cut_pos].strip()
|
||||
else:
|
||||
# No delimiters found after position 0, take everything before the last dot
|
||||
title = self.file_name.rsplit('.', 1)[0].strip()
|
||||
|
||||
# If year is at the beginning, remove it
|
||||
if year_pos == 0:
|
||||
if paren_match and paren_match.start() == 0:
|
||||
title = re.sub(r'^\(\d{4}\)\s*', '', title)
|
||||
elif dot_match and dot_match.start() == 0:
|
||||
title = re.sub(r'^\.\d{4}\.\s*', '', title)
|
||||
|
||||
# Remove common prefixes that are not part of the title
|
||||
# Remove bracketed prefixes like [01.1], [1], etc.
|
||||
title = re.sub(r'^\s*\[[^\]]+\]\s*', '', title)
|
||||
|
||||
# Clean up title: remove leading/trailing brackets and dots
|
||||
title = title.strip('[](). ')
|
||||
|
||||
# Replace colons with periods in the title
|
||||
title = title.replace(':', '.')
|
||||
|
||||
return title if title else None
|
||||
|
||||
def extract_year(self) -> str | None:
|
||||
"""Extract year from filename"""
|
||||
|
||||
@@ -210,3 +210,48 @@ The long title (2008) UHD 1440 ENG.mp4
|
||||
The long title (2008) 8K 4320p ENG.mp4
|
||||
Troll 2 (2025) WEB-DL 2160p HDR Ukr Nor [Hurtom].mkv
|
||||
Moana 2 (2024) MA WEB-DL 2160p SDR Ukr Eng [Hurtom].mkv
|
||||
|
||||
# Test cases for title extraction with various edge cases
|
||||
2001 A Space Odyssey (1968) [720p,ukr,eng].mkv
|
||||
The 100 (2014) Season 1 Episode 1 [720p,ukr].mkv
|
||||
[2024] Dune Part Two (2024) [2160p,HDR,ukr,eng].mkv
|
||||
Star Wars Episode IV - A New Hope (1977) [1080p,ukr,eng].mkv
|
||||
The Lord of the Rings 2001 Extended Edition (2001) BDRip 1080p [ukr,eng].mkv
|
||||
Matrix 1999 (1999) [720p,ukr].mkv
|
||||
(2023) Talk to Me [720p,ukr,eng].mkv
|
||||
Avatar The Way of Water (2022) [2160p,HDR,ukr,eng].mkv
|
||||
Guardians of the Galaxy Vol. 3 (2023) [1080p,ukr,eng].mkv
|
||||
Spider-Man No Way Home (2021) [2160p,HDR,ukr,eng].mkv
|
||||
The Batman (2022) [1080p,ukr,eng].mkv
|
||||
Oppenheimer (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Barbie (2023) [1080p,ukr,eng].mkv
|
||||
Wonka (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Aquaman and the Lost Kingdom (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Migration (2023) [1080p,ukr,eng].mkv
|
||||
The Holdovers (2023) [1080p,ukr,eng].mkv
|
||||
Killers of the Flower Moon (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Poor Things (2023) [1080p,ukr,eng].mkv
|
||||
Anatomy of a Fall (2023) [720p,ukr,eng].mkv
|
||||
|
||||
|
||||
# Test cases for title extraction with various edge cases
|
||||
2001 A Space Odyssey (1968) [720p,ukr,eng].mkv
|
||||
The 100 (2014) Season 1 Episode 1 [720p,ukr].mkv
|
||||
[2024] Dune Part Two (2024) [2160p,HDR,ukr,eng].mkv
|
||||
Star Wars Episode IV - A New Hope (1977) [1080p,ukr,eng].mkv
|
||||
The Lord of the Rings 2001 Extended Edition (2001) BDRip 1080p [ukr,eng].mkv
|
||||
Matrix 1999 (1999) [720p,ukr].mkv
|
||||
(2023) Talk to Me [720p,ukr,eng].mkv
|
||||
Avatar The Way of Water (2022) [2160p,HDR,ukr,eng].mkv
|
||||
Guardians of the Galaxy Vol. 3 (2023) [1080p,ukr,eng].mkv
|
||||
Spider-Man No Way Home (2021) [2160p,HDR,ukr,eng].mkv
|
||||
The Batman (2022) [1080p,ukr,eng].mkv
|
||||
Oppenheimer (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Barbie (2023) [1080p,ukr,eng].mkv
|
||||
Wonka (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Aquaman and the Lost Kingdom (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Migration (2023) [1080p,ukr,eng].mkv
|
||||
The Holdovers (2023) [1080p,ukr,eng].mkv
|
||||
Killers of the Flower Moon (2023) [2160p,HDR,ukr,eng].mkv
|
||||
Poor Things (2023) [1080p,ukr,eng].mkv
|
||||
Anatomy of a Fall (2023) [720p,ukr,eng].mkv
|
||||
|
||||
Reference in New Issue
Block a user