import sqlite3 import gzip from loguru import logger conn = sqlite3.connect("imdb_titles.sqlite") c = conn.cursor() c.execute( '''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''') c.execute("PRAGMA synchronous = OFF") c.execute("PRAGMA optimize") conn.commit() def convert_tsv_to_db(title_basics_tsv): '''Конвертирование основного датасета в sqlite базу, выполняется долго (~5 минут)''' with gzip.open(title_basics_tsv, mode='rt') as file: write_dataset = [] counter = 0 chunk = 1000 progress_counter = 0 for line in file: line = line.split("\t") try: tt_id = int(line[0].split("tt")[1]) tt_type = line[1] original_name = line[3] ru_name = None year = line[5] if year.startswith(r"\N"): year = None else: year = int(year) if tt_type not in ("movie", "tvMovie", "video"): original_name = None year = None write_dataset.append((tt_id, tt_type, original_name, ru_name, year)) counter += 1 if counter >= chunk: c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset) write_dataset = [] counter = 0 progress_counter += chunk logger.info(f'Обработано: {progress_counter}') except: logger.exception('Err') pass conn.commit() def extract_ru_locale_from_tsv(title_akas_tsv): '''Конвертирование датасета с локализованными названиями и последующее добавление в базу''' with gzip.open(title_akas_tsv, mode='rt') as file: ru_name_writer = [] counter = 0 for line in file: line = line.split("\t") try: tt_region = line[3] if tt_region != "RU": continue tt_id = int(line[0].split("tt")[1]) tt_type = c.execute("SELECT type FROM titles WHERE tt_id = (?)", (tt_id, )).fetchone()[0] if tt_type not in ("movie", "tvMovie", "video"): continue ru_name = line[2] ru_name_writer.append((ru_name, tt_id)) counter += 1 logger.info(f'Обработано ru_name: {counter}') except: logger.exception('Err') pass c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer) conn.commit() def convert_datasets_to_db(): print("Converting tsv dataset to sqlite...") convert_tsv_to_db("title.basics.tsv.gz") print("Unpack ru locale...") extract_ru_locale_from_tsv("title.akas.tsv.gz") def get_title_by_id(films_ids=list): tt_list = [] for i in films_ids: tt_film = c.execute("SELECT * FROM titles WHERE tt_id = (?)", (i,)).fetchone() if tt_film: tt_list.append(tt_film) if tt_list != []: return tt_list def get_title_by_names_and_year(film_names=list): tt_list = [] for i in film_names: tt_film = c.execute('''SELECT * FROM titles WHERE (original_name = (?) OR ru_name = (?)) AND year = (?)''', i).fetchone() if tt_film: tt_list.append(tt_film) if tt_list != []: return tt_list