import gzip
import sqlite3

# Module-level connection and cursor shared by every function in this file.
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()
c.execute(
    "CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), "
    "original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, "
    "year INTEGER DEFAULT NULL)"
)
c.execute("PRAGMA synchronous = OFF")
c.execute("PRAGMA optimize")
conn.commit()

# Title types for which the name and year are stored; other types keep only
# their id and type.
_NAMED_TYPES = ("movie", "video")
# Marker the dataset files use for a missing value.
_NULL_MARKER = "\\N"
_INSERT_TITLE_SQL = (
    "INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) "
    "VALUES (?, ?, ?, ?, ?)"
)


def _parse_tt_id(raw_id):
    """Return the numeric part of an id such as ``tt0000123``.

    Raises IndexError when the value has no ``tt`` in it (for example the
    header row) and ValueError when the remainder is not a number.
    """
    return int(raw_id.split("tt")[1])


def convert_tsv_to_db(title_basics_tsv, batch_size=1000):
    """Load the main (title basics) dataset into the ``titles`` table.

    Slow on the full dataset (about 5 minutes according to the original
    author's note).

    Args:
        title_basics_tsv: Path to the gzip-compressed, tab-separated file.
            Column 0 is the ``tt`` id, column 1 the type, column 3 the
            original name and column 5 the year.
        batch_size: Number of parsed rows buffered before they are written
            with a single ``executemany`` call.

    Rows that cannot be parsed (including the header row) are reported with
    ``print`` and skipped. Database errors are not swallowed; they propagate
    to the caller.
    """
    pending = []
    # The file is opened explicitly as UTF-8 so that decoding does not depend
    # on the locale's default codec.
    with gzip.open(title_basics_tsv, mode="rt", encoding="utf-8") as file:
        for line in file:
            fields = line.split("\t")
            try:
                tt_id = _parse_tt_id(fields[0])
                tt_type = fields[1]
                original_name = fields[3]
                raw_year = fields[5]
                ru_name = None
                if tt_type in _NAMED_TYPES:
                    print(tt_id, tt_type, original_name, ru_name, raw_year)
                    year = None if raw_year == _NULL_MARKER else int(raw_year)
                else:
                    # Name and year are only kept for movies and videos.
                    original_name = None
                    year = None
            except (IndexError, ValueError) as error:
                print(error)
                continue

            pending.append((tt_id, tt_type, original_name, ru_name, year))
            if len(pending) >= batch_size:
                c.executemany(_INSERT_TITLE_SQL, pending)
                pending = []

    # Write whatever is left over from the last, partially filled batch.
    if pending:
        c.executemany(_INSERT_TITLE_SQL, pending)
    conn.commit()


def extract_ru_locale_from_tsv(title_akas_tsv):
    """Read localized titles and store the ``RU`` ones in ``titles.ru_name``.

    Args:
        title_akas_tsv: Path to the gzip-compressed, tab-separated file.
            Column 0 is the ``tt`` id, column 2 the localized title and
            column 3 the region code.

    Only rows whose region is ``RU`` and whose id is already present in the
    table with type ``movie`` or ``video`` are applied. When a title has
    several ``RU`` rows, the last one in the file wins. Rows that cannot be
    parsed are reported with ``print`` and skipped.
    """
    ru_names = []
    with gzip.open(title_akas_tsv, mode="rt", encoding="utf-8") as file:
        for line in file:
            fields = line.split("\t")
            try:
                if fields[3] != "RU":
                    continue
                tt_id = _parse_tt_id(fields[0])
                row = c.execute(
                    "SELECT type FROM titles WHERE tt_id = ?", (tt_id,)
                ).fetchone()
                # The id may be absent from the table; skip it explicitly
                # rather than failing on ``None[0]``.
                if row is None or row[0] not in _NAMED_TYPES:
                    continue
                ru_name = fields[2]
                print(ru_name, row[0])
                ru_names.append((ru_name, tt_id))
            except (IndexError, ValueError) as error:
                print(error)

    c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_names)
    conn.commit()


def convert_datasets_to_db():
    """Import both dataset files, expected in the current working directory."""
    print("Converting tsv dataset to sqlite...")
    convert_tsv_to_db("title.basics.tsv.gz")
    print("Unpack ru locale...")
    extract_ru_locale_from_tsv("title.akas.tsv.gz")


def get_title_by_id(films_ids=()):
    """Fetch the stored rows for the given numeric title ids.

    Args:
        films_ids: Iterable of ``tt_id`` values. Defaults to an empty tuple.

    Returns:
        A list with one entry per requested id, in the same order. Each entry
        is the ``(tt_id, type, original_name, ru_name, year)`` row, or
        ``None`` when the id is not in the table. The list is also printed.
    """
    tt_list = [
        c.execute("SELECT * FROM titles WHERE tt_id = ?", (film_id,)).fetchone()
        for film_id in films_ids
    ]
    print(tt_list)
    return tt_list