FMN_bot/src/imdb_datasets_worker.py

import sqlite3
import gzip
from loguru import logger
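# Builds a local sqlite lookup table from the IMDb dataset dumps: one row per title with
# its numeric tt id, title type, original name, optional Russian (ru) name and release year.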
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()
c.execute(
    '''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''')
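# synchronous = OFF skips fsync between writes, which speeds up bulk inserts at the cost of
# durability on power loss; an acceptable trade-off for a table that can be rebuilt from the dumps.
# PRAGMA optimize asks sqlite to run any internal housekeeping it finds useful (e.g. planner statistics).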
c.execute("PRAGMA synchronous = OFF")
c.execute("PRAGMA optimize")
conn.commit()
def convert_tsv_to_db(title_basics_tsv):
    '''Convert the main dataset into the sqlite database; this takes a while (~5 minutes).'''
    with gzip.open(title_basics_tsv, mode='rt') as file:
        write_dataset = []
        counter = 0
        chunk = 1000
        progress_counter = 0
        for line in file:
            line = line.split("\t")
            try:
                tt_id = int(line[0].split("tt")[1])
                tt_type = line[1]
                original_name = line[3]
                ru_name = None
                year = line[5]
                genres = line[-1].strip().split(',')
                if year.startswith(r"\N"):
                    year = None
                else:
                    year = int(year)
                # Keep names and years only for feature films; other types are stored as id + type stubs
                if tt_type not in ("movie", "tvMovie", "video"):
                    original_name = None
                    year = None
                if "Documentary" in genres:
                    logger.debug(f'Documentary {original_name} filtered out')
                    original_name = None
                    year = None
                    tt_type = "doc"
                write_dataset.append((tt_id, tt_type, original_name, ru_name, year))
                counter += 1
                # Write in chunks of 1000 rows to keep memory usage low
                if counter >= chunk:
                    c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset)
                    write_dataset = []
                    counter = 0
                    progress_counter += chunk
                    logger.info(f'Processed: {progress_counter}')
            except Exception:
                logger.exception('Err')
    # Flush the final, partially filled chunk that never reached the 1000-row threshold
    if write_dataset:
        c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset)
    conn.commit()
def extract_ru_locale_from_tsv(title_akas_tsv):
    '''Convert the dataset of localized titles and add the Russian names to the database.'''
    with gzip.open(title_akas_tsv, mode='rt') as file:
        ru_name_writer = []
        counter = 0
        for line in file:
            line = line.split("\t")
            try:
                tt_region = line[3]
                if tt_region != "RU":
                    continue
                tt_id = int(line[0].split("tt")[1])
                tt_type = c.execute("SELECT type FROM titles WHERE tt_id = (?)", (tt_id, )).fetchone()[0]
                if tt_type not in ("movie", "tvMovie", "video"):
                    continue
                ru_name = line[2]
                ru_name_writer.append((ru_name, tt_id))
                counter += 1
                logger.info(f'Processed ru_name: {counter}')
            except Exception:
                logger.exception('Err')
    # Apply all collected Russian names in one batched UPDATE
    c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer)
    conn.commit()
def convert_datasets_to_db():
    print("Converting tsv dataset to sqlite...")
    convert_tsv_to_db("title.basics.tsv.gz")
    print("Unpack ru locale...")
    extract_ru_locale_from_tsv("title.akas.tsv.gz")
def get_title_by_id(films_ids: list):
    tt_list = []
    for i in films_ids:
        tt_film = c.execute("SELECT * FROM titles WHERE tt_id = (?)", (i,)).fetchone()
        if tt_film:
            tt_list.append(tt_film)
    if tt_list:
        return tt_list
def get_title_by_names_and_year(film_names: list):
    tt_list = []
    for i in film_names:
        tt_film = c.execute('''SELECT * FROM titles
                               WHERE (original_name = (?) OR ru_name = (?))
                               AND year = (?)''', i).fetchone()
        if tt_film:
            tt_list.append(tt_film)
    if tt_list:
        return tt_list
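
# Minimal usage sketch, assuming the compressed IMDb dumps title.basics.tsv.gz and
# title.akas.tsv.gz have already been downloaded into the working directory
# (available from https://datasets.imdbws.com/); the ids and names below are only
# hypothetical lookups and return None when they are not present in the local table.
if __name__ == "__main__":
    convert_datasets_to_db()
    # Look up titles by their numeric IMDb ids (tt0111161 -> 111161)
    print(get_title_by_id([111161, 133093]))
    # Each lookup tuple matches the three SQL placeholders: (original_name, ru_name, year)
    print(get_title_by_names_and_year([("The Matrix", "Матрица", 1999)]))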