FMN_bot/src/imdb_datasets_worker.py

112 lines
4.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import sqlite3
import gzip
from loguru import logger
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()
c.execute(
'''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''')
c.execute("PRAGMA synchronous = OFF")
c.execute("PRAGMA optimize")
conn.commit()
def convert_tsv_to_db(title_basics_tsv):
'''Конвертирование основного датасета в sqlite базу, выполняется долго (~5 минут)'''
with gzip.open(title_basics_tsv, mode='rt') as file:
write_dataset = []
counter = 0
chunk = 1000
progress_counter = 0
for line in file:
line = line.split("\t")
try:
tt_id = int(line[0].split("tt")[1])
tt_type = line[1]
original_name = line[3]
ru_name = None
year = line[5]
genres = line[-1].strip().split(',')
if year.startswith(r"\N"):
year = None
else:
year = int(year)
if tt_type not in ("movie", "tvMovie", "video"):
original_name = None
year = None
if "Documentary" in genres:
logger.debug(f'Документальный {original_name} отсеян')
original_name = None
year = None
tt_type = "doc"
write_dataset.append((tt_id, tt_type, original_name, ru_name, year))
counter += 1
if counter >= chunk:
c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset)
write_dataset = []
counter = 0
progress_counter += chunk
logger.info(f'Обработано: {progress_counter}')
except:
logger.exception('Err')
pass
conn.commit()
def extract_ru_locale_from_tsv(title_akas_tsv):
'''Конвертирование датасета с локализованными названиями и последующее добавление в базу'''
with gzip.open(title_akas_tsv, mode='rt') as file:
ru_name_writer = []
counter = 0
for line in file:
line = line.split("\t")
try:
tt_region = line[3]
if tt_region != "RU":
continue
tt_id = int(line[0].split("tt")[1])
tt_type = c.execute("SELECT type FROM titles WHERE tt_id = (?)", (tt_id, )).fetchone()[0]
if tt_type not in ("movie", "tvMovie", "video"):
continue
ru_name = line[2]
ru_name_writer.append((ru_name, tt_id))
counter += 1
logger.info(f'Обработано ru_name: {counter}')
except:
logger.exception('Err')
pass
c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer)
conn.commit()
def convert_datasets_to_db():
print("Converting tsv dataset to sqlite...")
convert_tsv_to_db("title.basics.tsv.gz")
print("Unpack ru locale...")
extract_ru_locale_from_tsv("title.akas.tsv.gz")
def get_title_by_id(films_ids=list):
tt_list = []
for i in films_ids:
tt_film = c.execute("SELECT * FROM titles WHERE tt_id = (?)", (i,)).fetchone()
if tt_film:
tt_list.append(tt_film)
if tt_list != []:
return tt_list
def get_title_by_names_and_year(film_names=list):
tt_list = []
for i in film_names:
tt_film = c.execute('''SELECT * FROM titles
WHERE (original_name = (?) OR ru_name = (?))
AND year = (?)''', i).fetchone()
if tt_film:
tt_list.append(tt_film)
if tt_list != []:
return tt_list