From 38e3f0e55334732e15e117fecc5ed58de941d004 Mon Sep 17 00:00:00 2001 From: sHa Date: Fri, 26 Dec 2025 14:06:40 +0000 Subject: [PATCH] feat: Improve title extraction logic and add comprehensive test cases for edge cases --- renamer/extractors/filename_extractor.py | 62 +++++++++++++++++++++--- renamer/test/filenames.txt | 45 +++++++++++++++++ 2 files changed, 101 insertions(+), 6 deletions(-) diff --git a/renamer/extractors/filename_extractor.py b/renamer/extractors/filename_extractor.py index 106fbef..7f39e58 100644 --- a/renamer/extractors/filename_extractor.py +++ b/renamer/extractors/filename_extractor.py @@ -19,15 +19,65 @@ class FilenameExtractor: def extract_title(self) -> str | None: """Extract movie title from filename""" - temp_name = re.sub(r'\s*\(\d{4}\)\s*|\s*\d{4}\s*|\.\d{4}\.', '', self.file_name) - - # Find and remove source + # Find positions of year, source, and quality brackets + year_pos = -1 + source_pos = -1 + quality_pos = -1 + paren_match = None + dot_match = None + + # Find year position (either (YYYY) or .YYYY.) + paren_match = re.search(r'\((\d{4})\)', self.file_name) + if paren_match: + year_pos = paren_match.start() + else: + dot_match = re.search(r'\.(\d{4})\.', self.file_name) + if dot_match: + year_pos = dot_match.start() + + # Find source position source = self.extract_source() if source: for alias in SOURCE_DICT[source]: - temp_name = re.sub(r'\b' + re.escape(alias) + r'\b', '', temp_name, flags=re.IGNORECASE) - - return temp_name.rsplit('.', 1)[0].strip() + match = re.search(r'\b' + re.escape(alias) + r'\b', self.file_name, re.IGNORECASE) + if match: + source_pos = match.start() + break + + # Find quality bracket position (like [720p,ukr,eng]) + quality_match = re.search(r'\[[^\]]*(?:720p|1080p|2160p|480p|SD|HD|HDR)[^\]]*\]', self.file_name) + if quality_match: + quality_pos = quality_match.start() + + # Find the earliest position that's not at the beginning + positions = [pos for pos in [year_pos, source_pos, quality_pos] if pos > 0] + cut_pos = min(positions) if positions else -1 + + # Extract title (everything before the cut position) + if cut_pos > 0: + title = self.file_name[:cut_pos].strip() + else: + # No delimiters found after position 0, take everything before the last dot + title = self.file_name.rsplit('.', 1)[0].strip() + + # If year is at the beginning, remove it + if year_pos == 0: + if paren_match and paren_match.start() == 0: + title = re.sub(r'^\(\d{4}\)\s*', '', title) + elif dot_match and dot_match.start() == 0: + title = re.sub(r'^\.\d{4}\.\s*', '', title) + + # Remove common prefixes that are not part of the title + # Remove bracketed prefixes like [01.1], [1], etc. + title = re.sub(r'^\s*\[[^\]]+\]\s*', '', title) + + # Clean up title: remove leading/trailing brackets and dots + title = title.strip('[](). ') + + # Replace colons with periods in the title + title = title.replace(':', '.') + + return title if title else None def extract_year(self) -> str | None: """Extract year from filename""" diff --git a/renamer/test/filenames.txt b/renamer/test/filenames.txt index 22ce6a8..15cd236 100644 --- a/renamer/test/filenames.txt +++ b/renamer/test/filenames.txt @@ -210,3 +210,48 @@ The long title (2008) UHD 1440 ENG.mp4 The long title (2008) 8K 4320p ENG.mp4 Troll 2 (2025) WEB-DL 2160p HDR Ukr Nor [Hurtom].mkv Moana 2 (2024) MA WEB-DL 2160p SDR Ukr Eng [Hurtom].mkv + +# Test cases for title extraction with various edge cases +2001 A Space Odyssey (1968) [720p,ukr,eng].mkv +The 100 (2014) Season 1 Episode 1 [720p,ukr].mkv +[2024] Dune Part Two (2024) [2160p,HDR,ukr,eng].mkv +Star Wars Episode IV - A New Hope (1977) [1080p,ukr,eng].mkv +The Lord of the Rings 2001 Extended Edition (2001) BDRip 1080p [ukr,eng].mkv +Matrix 1999 (1999) [720p,ukr].mkv +(2023) Talk to Me [720p,ukr,eng].mkv +Avatar The Way of Water (2022) [2160p,HDR,ukr,eng].mkv +Guardians of the Galaxy Vol. 3 (2023) [1080p,ukr,eng].mkv +Spider-Man No Way Home (2021) [2160p,HDR,ukr,eng].mkv +The Batman (2022) [1080p,ukr,eng].mkv +Oppenheimer (2023) [2160p,HDR,ukr,eng].mkv +Barbie (2023) [1080p,ukr,eng].mkv +Wonka (2023) [2160p,HDR,ukr,eng].mkv +Aquaman and the Lost Kingdom (2023) [2160p,HDR,ukr,eng].mkv +Migration (2023) [1080p,ukr,eng].mkv +The Holdovers (2023) [1080p,ukr,eng].mkv +Killers of the Flower Moon (2023) [2160p,HDR,ukr,eng].mkv +Poor Things (2023) [1080p,ukr,eng].mkv +Anatomy of a Fall (2023) [720p,ukr,eng].mkv + + +# Test cases for title extraction with various edge cases +2001 A Space Odyssey (1968) [720p,ukr,eng].mkv +The 100 (2014) Season 1 Episode 1 [720p,ukr].mkv +[2024] Dune Part Two (2024) [2160p,HDR,ukr,eng].mkv +Star Wars Episode IV - A New Hope (1977) [1080p,ukr,eng].mkv +The Lord of the Rings 2001 Extended Edition (2001) BDRip 1080p [ukr,eng].mkv +Matrix 1999 (1999) [720p,ukr].mkv +(2023) Talk to Me [720p,ukr,eng].mkv +Avatar The Way of Water (2022) [2160p,HDR,ukr,eng].mkv +Guardians of the Galaxy Vol. 3 (2023) [1080p,ukr,eng].mkv +Spider-Man No Way Home (2021) [2160p,HDR,ukr,eng].mkv +The Batman (2022) [1080p,ukr,eng].mkv +Oppenheimer (2023) [2160p,HDR,ukr,eng].mkv +Barbie (2023) [1080p,ukr,eng].mkv +Wonka (2023) [2160p,HDR,ukr,eng].mkv +Aquaman and the Lost Kingdom (2023) [2160p,HDR,ukr,eng].mkv +Migration (2023) [1080p,ukr,eng].mkv +The Holdovers (2023) [1080p,ukr,eng].mkv +Killers of the Flower Moon (2023) [2160p,HDR,ukr,eng].mkv +Poor Things (2023) [1080p,ukr,eng].mkv +Anatomy of a Fall (2023) [720p,ukr,eng].mkv