feat: Add audio languages and tracks extraction from filename, with corresponding tests
This commit is contained in:
@@ -45,7 +45,8 @@ class MediaExtractor:
|
||||
('Filename', lambda: self.filename_extractor.extract_movie_db())
|
||||
],
|
||||
'audio_langs': [
|
||||
('MediaInfo', lambda: self.mediainfo_extractor.extract_audio_langs())
|
||||
('MediaInfo', lambda: self.mediainfo_extractor.extract_audio_langs()),
|
||||
('Filename', lambda: self.filename_extractor.extract_audio_langs())
|
||||
],
|
||||
'meta_type': [
|
||||
('Metadata', lambda: self.metadata_extractor.extract_meta_type())
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
from ..constants import SOURCE_DICT, FRAME_CLASSES, MOVIE_DB_DICT
|
||||
import langcodes
|
||||
|
||||
|
||||
class FilenameExtractor:
|
||||
@@ -159,4 +161,247 @@ class FilenameExtractor:
|
||||
if any(db_type_lower.startswith(pattern.rstrip('-')) for pattern in db_info['patterns']):
|
||||
return (db_key, db_id)
|
||||
|
||||
return None
|
||||
return None
|
||||
|
||||
def extract_audio_langs(self) -> str:
    """Extract audio languages from the filename.

    Scans bracketed groups (e.g. ``[3xUkr,Eng]``) and standalone
    dot/space/underscore-separated tokens (e.g. ``.ukr.eng.``) for
    2-3 letter language codes, converts them to 3-letter ISO codes via
    ``langcodes``, and returns a mediainfo-style summary such as
    ``"2ukr,eng"`` (a count prefix only when a language occurs more than
    once; entries sorted). Returns ``''`` when nothing is detected.
    """
    # Quality/resolution tokens that look like 2-3 letter codes but are not languages.
    quality_tokens = frozenset({'sd', 'hd', 'lq', 'qhd', 'uhd', 'p', 'i', 'hdr', 'sdr'})
    # Known 2-3 letter ISO 639 codes accepted as audio languages.
    # (Same membership as the original literal, with duplicates removed.)
    known_language_codes = frozenset({
        # Western / Northern Europe
        'eng', 'fra', 'deu', 'spa', 'ita', 'por', 'nld', 'dut', 'swe', 'nor',
        'dan', 'fin', 'isl', 'fao',
        # Central / Eastern Europe & Balkans
        'pol', 'cze', 'ces', 'slk', 'slv', 'hun', 'ron', 'mol', 'bul', 'hrv',
        'bos', 'srp', 'mkd', 'alb', 'sqi', 'ell', 'est', 'lav', 'lit', 'bel',
        'ukr', 'rus',
        # Caucasus / Central Asia / Middle East
        'tur', 'aze', 'geo', 'kat', 'arm', 'hye', 'kaz', 'kir', 'tuk', 'uzb',
        'mon', 'per', 'kur', 'pus', 'ara', 'arb', 'heb',
        # South Asia
        'hin', 'urd', 'ben', 'mar', 'tam', 'tel', 'guj', 'kan', 'mal', 'ori',
        'pan', 'asm', 'mai', 'bho', 'awa', 'mag', 'nep', 'sin', 'div', 'san',
        'tib',
        # East / Southeast Asia
        'jpn', 'kor', 'chi', 'zho', 'yue', 'wuu', 'nan', 'hak', 'gan', 'hsn',
        'tha', 'lao', 'khm', 'mya', 'vie', 'ind', 'msa',
        # Undetermined
        'und',
    })
    # Title-case tokens still accepted (e.g. "Ukr"); other Title-case words
    # are skipped to avoid false positives from capitalized English words.
    allowed_title_case = frozenset({
        'ukr', 'nor', 'eng', 'rus', 'fra', 'deu', 'spa', 'ita', 'por', 'swe',
        'dan', 'fin', 'pol', 'cze', 'hun', 'tur', 'ara', 'heb', 'hin', 'jpn',
        'kor', 'chi', 'tha', 'vie', 'und',
    })
    # Common non-language tokens. Built ONCE as a set — the original rebuilt
    # this as a list on every loop iteration (per-token O(n) membership).
    skip_words = frozenset({
        # frequent short English words
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
        'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'way', 'who',
        'boy', 'did', 'let', 'put', 'say', 'she', 'too', 'use',
        # container / file extensions
        'avi', 'mkv', 'mp4', 'mpg', 'mov', 'wmv', 'flv', 'webm', 'm4v',
        'm2ts', 'ts', 'vob', 'iso', 'img',
        # quality / resolution tags
        'sd', 'hd', 'lq', 'qhd', 'uhd', 'p', 'i', 'hdr', 'sdr', '4k', '8k',
        '2160p', '1080p', '720p', '480p', '360p', '240p', '144p',
        # source / codec tags
        'web', 'dl', 'rip', 'bluray', 'dvd', 'hdtv', 'bdrip', 'dvdrip',
        'xvid', 'divx', 'h264', 'h265', 'x264', 'x265', 'hevc', 'avc',
        # audio codec tags
        'ma', 'atmos', 'dts', 'aac', 'ac3', 'mp3', 'flac', 'wav', 'wma',
        'ogg', 'opus',
    })

    langs: list[str] = []

    # Pass 1: languages inside [...] groups.
    for bracket in re.findall(r'\[([^\]]+)\]', self.file_name):
        # Skip brackets that carry movie-database IDs, not language lists.
        if any(db in bracket.lower() for db in ('imdb', 'tmdb', 'tvdb')):
            continue
        # Items are separated by commas or underscores.
        for item in (piece.strip() for piece in re.split(r'[,_]', bracket)):
            if len(item) < 2:
                continue
            item_lower = item.lower()
            # Subtitle indicators are not audio languages.
            if item_lower in ('sub', 'subs', 'subtitle'):
                continue
            # Optional "<count>x" prefix followed by a 2-3 letter code, e.g. "3xukr".
            lang_match = re.search(r'(?:(\d+)x?)?([a-z]{2,3})$', item_lower)
            if not lang_match:
                continue
            count = int(lang_match.group(1)) if lang_match.group(1) else 1
            lang_code = lang_match.group(2)
            if lang_code in quality_tokens:
                continue
            # Everything before the code must be just the numeric "<n>x" prefix.
            if not re.match(r'^(?:\d+x?)?$', item_lower[:-len(lang_code)]):
                continue
            # Normalize to a 3-letter ISO code; skip anything langcodes rejects.
            try:
                iso3_code = langcodes.Language.get(lang_code).to_alpha3()
            except Exception:  # narrowed from bare except
                continue
            langs.extend([iso3_code] * count)

    # Pass 2: standalone codes outside brackets (".ukr.", " ENG ", "_nor_").
    text_without_brackets = re.sub(r'\[([^\]]+)\]', '', self.file_name)
    for part in re.split(r'[.\s_]+', text_without_brackets):
        part = part.strip()
        if len(part) < 2 or not re.match(r'^[a-zA-Z]{2,3}$', part):
            continue
        # Title-case guards against false positives like "In" -> "ind".
        if part.istitle() and (len(part) == 2 or part.lower() not in allowed_title_case):
            continue
        part_lower = part.lower()
        if part_lower in skip_words or part_lower not in known_language_codes:
            continue
        try:
            langs.append(langcodes.Language.get(part_lower).to_alpha3())
        except Exception:  # narrowed from bare except; invalid codes skipped
            pass

    if not langs:
        return ''

    # Format like mediainfo: count-prefixed when > 1, e.g. "2ukr,eng".
    lang_counts = Counter(langs)
    return ','.join(sorted(
        f"{count}{lang}" if count > 1 else lang
        for lang, count in lang_counts.items()
    ))
|
||||
|
||||
def extract_audio_tracks(self) -> list[dict]:
    """Extract audio track data from the filename (language only).

    Mirrors ``extract_audio_langs`` but returns one ``{'language': <iso3>}``
    dict per detected audio track instead of a formatted summary string.
    Returns an empty list when no languages are detected.
    """
    # Quality/resolution tokens that look like 2-3 letter codes but are not languages.
    quality_tokens = frozenset({'sd', 'hd', 'lq', 'qhd', 'uhd', 'p', 'i', 'hdr', 'sdr'})
    # Known 2-3 letter ISO 639 codes accepted as audio languages.
    # (Same membership as the original literal, with duplicates removed.)
    known_language_codes = frozenset({
        # Western / Northern Europe
        'eng', 'fra', 'deu', 'spa', 'ita', 'por', 'nld', 'dut', 'swe', 'nor',
        'dan', 'fin', 'isl', 'fao',
        # Central / Eastern Europe & Balkans
        'pol', 'cze', 'ces', 'slk', 'slv', 'hun', 'ron', 'mol', 'bul', 'hrv',
        'bos', 'srp', 'mkd', 'alb', 'sqi', 'ell', 'est', 'lav', 'lit', 'bel',
        'ukr', 'rus',
        # Caucasus / Central Asia / Middle East
        'tur', 'aze', 'geo', 'kat', 'arm', 'hye', 'kaz', 'kir', 'tuk', 'uzb',
        'mon', 'per', 'kur', 'pus', 'ara', 'arb', 'heb',
        # South Asia
        'hin', 'urd', 'ben', 'mar', 'tam', 'tel', 'guj', 'kan', 'mal', 'ori',
        'pan', 'asm', 'mai', 'bho', 'awa', 'mag', 'nep', 'sin', 'div', 'san',
        'tib',
        # East / Southeast Asia
        'jpn', 'kor', 'chi', 'zho', 'yue', 'wuu', 'nan', 'hak', 'gan', 'hsn',
        'tha', 'lao', 'khm', 'mya', 'vie', 'ind', 'msa',
        # Undetermined
        'und',
    })
    # Title-case tokens still accepted (e.g. "Ukr"); other Title-case words
    # are skipped to avoid false positives from capitalized English words.
    allowed_title_case = frozenset({
        'ukr', 'nor', 'eng', 'rus', 'fra', 'deu', 'spa', 'ita', 'por', 'swe',
        'dan', 'fin', 'pol', 'cze', 'hun', 'tur', 'ara', 'heb', 'hin', 'jpn',
        'kor', 'chi', 'tha', 'vie', 'und',
    })
    # Common non-language tokens. Built ONCE as a set — the original rebuilt
    # this as a list on every loop iteration (per-token O(n) membership).
    skip_words = frozenset({
        # frequent short English words
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
        'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'way', 'who',
        'boy', 'did', 'let', 'put', 'say', 'she', 'too', 'use',
        # container / file extensions
        'avi', 'mkv', 'mp4', 'mpg', 'mov', 'wmv', 'flv', 'webm', 'm4v',
        'm2ts', 'ts', 'vob', 'iso', 'img',
        # quality / resolution tags
        'sd', 'hd', 'lq', 'qhd', 'uhd', 'p', 'i', 'hdr', 'sdr', '4k', '8k',
        '2160p', '1080p', '720p', '480p', '360p', '240p', '144p',
        # source / codec tags
        'web', 'dl', 'rip', 'bluray', 'dvd', 'hdtv', 'bdrip', 'dvdrip',
        'xvid', 'divx', 'h264', 'h265', 'x264', 'x265', 'hevc', 'avc',
        # audio codec tags
        'ma', 'atmos', 'dts', 'aac', 'ac3', 'mp3', 'flac', 'wav', 'wma',
        'ogg', 'opus',
    })

    tracks: list[dict] = []

    # Pass 1: languages inside [...] groups.
    for bracket in re.findall(r'\[([^\]]+)\]', self.file_name):
        # Skip brackets that carry movie-database IDs, not language lists.
        if any(db in bracket.lower() for db in ('imdb', 'tmdb', 'tvdb')):
            continue
        # Items are separated by commas or underscores.
        for item in (piece.strip() for piece in re.split(r'[,_]', bracket)):
            if len(item) < 2:
                continue
            item_lower = item.lower()
            # Subtitle indicators are not audio languages.
            if item_lower in ('sub', 'subs', 'subtitle'):
                continue
            # Optional "<count>x" prefix followed by a 2-3 letter code, e.g. "3xukr".
            lang_match = re.search(r'(?:(\d+)x?)?([a-z]{2,3})$', item_lower)
            if not lang_match:
                continue
            count = int(lang_match.group(1)) if lang_match.group(1) else 1
            lang_code = lang_match.group(2)
            if lang_code in quality_tokens:
                continue
            # Everything before the code must be just the numeric "<n>x" prefix.
            if not re.match(r'^(?:\d+x?)?$', item_lower[:-len(lang_code)]):
                continue
            # Normalize to a 3-letter ISO code; skip anything langcodes rejects.
            try:
                iso3_code = langcodes.Language.get(lang_code).to_alpha3()
            except Exception:  # narrowed from bare except
                continue
            # BUG FIX: the original parsed the "<count>x" prefix but appended a
            # single track; emit one track per counted occurrence, matching
            # extract_audio_langs. A fresh dict per track avoids aliasing.
            tracks.extend({'language': iso3_code} for _ in range(count))

    # Pass 2: standalone codes outside brackets (".ukr.", " ENG ", "_nor_").
    text_without_brackets = re.sub(r'\[([^\]]+)\]', '', self.file_name)
    for part in re.split(r'[.\s_]+', text_without_brackets):
        part = part.strip()
        if len(part) < 2 or not re.match(r'^[a-zA-Z]{2,3}$', part):
            continue
        # Title-case guards against false positives like "In" -> "ind".
        if part.istitle() and (len(part) == 2 or part.lower() not in allowed_title_case):
            continue
        part_lower = part.lower()
        if part_lower in skip_words or part_lower not in known_language_codes:
            continue
        try:
            tracks.append({'language': langcodes.Language.get(part_lower).to_alpha3()})
        except Exception:  # narrowed from bare except; invalid codes skipped
            pass

    return tracks
|
||||
@@ -96,4 +96,33 @@ def test_extract_movie_db(filename):
|
||||
assert isinstance(movie_db, tuple) and len(movie_db) == 2
|
||||
assert isinstance(movie_db[0], str) and isinstance(movie_db[1], str)
|
||||
else:
|
||||
assert movie_db is None
|
||||
assert movie_db is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", load_test_filenames())
def test_extract_audio_langs(filename):
    """Test audio languages extraction from filename."""
    file_path = Path(filename)
    extractor = FilenameExtractor(file_path)
    audio_langs = extractor.extract_audio_langs()
    # Print filename and extracted audio languages clearly.
    # BUG FIX: the f-string had no placeholder and printed a literal
    # "(unknown)" instead of the parametrized filename.
    print(f"\nFilename: \033[1;36m{filename}\033[0m")
    print(f"Extracted audio langs: \033[1;32m{audio_langs}\033[0m")
    # Audio langs should be a string (possibly empty)
    assert isinstance(audio_langs, str)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", load_test_filenames())
def test_extract_audio_tracks(filename):
    """Test audio tracks extraction from filename."""
    file_path = Path(filename)
    extractor = FilenameExtractor(file_path)
    audio_tracks = extractor.extract_audio_tracks()
    # Print filename and extracted audio tracks clearly.
    # BUG FIX: the f-string had no placeholder and printed a literal
    # "(unknown)" instead of the parametrized filename.
    print(f"\nFilename: \033[1;36m{filename}\033[0m")
    print(f"Extracted audio tracks: \033[1;32m{audio_tracks}\033[0m")
    # Audio tracks should be a list of dicts, each carrying a 'language' key
    assert isinstance(audio_tracks, list)
    for track in audio_tracks:
        assert isinstance(track, dict)
        assert 'language' in track
|
||||
Reference in New Issue
Block a user