import gzip
import re
from pathlib import Path

# Gzip-compressed EPG XML file that this script reads and later rewrites.
FILE = Path(r"Z:\TvGuide\epg.xml.gz")

# Load the whole document as text.  Bytes that are not valid UTF-8 are
# substituted instead of raising, so a damaged file can still be processed.
with gzip.open(FILE, mode="rt", encoding="utf-8", errors="replace") as src:
    xml = src.read()

# Strip the control characters listed in the pattern.  Tab (\x09), line feed
# (\x0A) and carriage return (\x0D) are outside the class and therefore kept.
_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F]")
xml = _CONTROL_CHARS.sub("", xml)

# Remove line breaks and tabs inside <title> elements.
def fix_title(m):
    """Rebuild one matched ``<title>`` element with its text on a single line.

    ``m`` is a regex match whose group 1 holds the raw attribute text of the
    opening tag and whose group 2 holds the element content.  Every run of
    whitespace in the content (line breaks and tabs included) becomes a
    single space and the ends are trimmed; the attribute text is emitted
    unchanged.

    Returns the reassembled ``<title ...>...</title>`` string.
    """
    attrs, raw_text = m.group(1, 2)

    # split() without a separator cuts on any whitespace run and ignores
    # leading/trailing whitespace, so re-joining with one space both
    # collapses and trims the text in a single step.
    one_line = " ".join(raw_text.split())

    return f"<title{attrs}>{one_line}</title>"

# Matches a <title> element that has its own closing tag.
#   group 1 - raw attribute text of the opening tag (may be empty)
#   group 2 - element content
# The lookahead after the tag name keeps longer names such as <titles> out.
# The lookbehind before ">" rejects the self-closing form <title .../>:
# without it, the lazy body would run on to the NEXT </title> and the
# whitespace of every element in between would be collapsed as well.
_TITLE_RE = re.compile(r"<title(?=[\s>])([^>]*)(?<!/)>(.*?)</title>", re.S)

# Put the text of every <title> element on a single line.
xml = _TITLE_RE.sub(fix_title, xml)

# NOTE(review): the file is overwritten in place; if the write fails part-way
# the original content is gone.  Consider writing to a temporary file and
# swapping it in if that risk matters for this data.
with gzip.open(FILE, "wt", encoding="utf-8") as f:
    f.write(xml)

print("Готово")
print("Исправлен файл:", FILE)

# Verification after the fix: re-read the file and collect every title whose
# content still contains a line break.  The same pattern as above is used so
# that the check looks at exactly the elements that were rewritten.
with gzip.open(FILE, "rt", encoding="utf-8", errors="replace") as f:
    check = f.read()

bad_titles = [
    found.group(2)
    for found in _TITLE_RE.finditer(check)
    if "\n" in found.group(2) or "\r" in found.group(2)
]

print("Title с переносами:", len(bad_titles))

# Show the offending titles with repr() so the control characters are visible.
for t in bad_titles:
    print(repr(t))