FMN_bot/src/imdb_datasets_worker.py

import sqlite3
import gzip
from loguru import logger
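# Builds a local sqlite lookup table from the IMDb dataset dumps: one row per title with
# its numeric tt id, title type, original name, optional Russian (ru) name and release year.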
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()
c.execute(
    '''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''')
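# synchronous = OFF skips fsync between writes, which speeds up bulk inserts at the cost of
# durability on power loss; an acceptable trade-off for a table that can be rebuilt from the dumps.
# PRAGMA optimize asks sqlite to run any internal housekeeping it finds useful (e.g. planner statistics).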
c.execute("PRAGMA synchronous = OFF")
c.execute("PRAGMA optimize")
conn.commit()
def convert_tsv_to_db(title_basics_tsv):
    '''Convert the main dataset into the sqlite database; this takes a while (~5 minutes).'''
    with gzip.open(title_basics_tsv, mode='rt') as file:
        write_dataset = []
        counter = 0
        chunk = 1000
        progress_counter = 0
        for line in file:
            line = line.split("\t")
            try:
                tt_id = int(line[0].split("tt")[1])
                tt_type = line[1]
                original_name = line[3]
                ru_name = None
                year = line[5]
                genres = line[-1].strip().split(',')
                if year.startswith(r"\N"):
                    year = None
                else:
                    year = int(year)
                # Keep names and years only for feature films; other types are stored as id + type stubs
                if tt_type not in ("movie", "tvMovie", "video"):
                    original_name = None
                    year = None
                if "Documentary" in genres:
                    logger.debug(f'Documentary {original_name} filtered out')
                    original_name = None
                    year = None
                    tt_type = "doc"
                write_dataset.append((tt_id, tt_type, original_name, ru_name, year))
                counter += 1
                # Write in chunks of 1000 rows to keep memory usage low
                if counter >= chunk:
                    c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset)
                    write_dataset = []
                    counter = 0
                    progress_counter += chunk
                    logger.info(f'Processed: {progress_counter}')
            except Exception:
                logger.exception('Err')
    # Flush the final, partially filled chunk that never reached the 1000-row threshold
    if write_dataset:
        c.executemany("INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)", write_dataset)
    conn.commit()
def extract_ru_locale_from_tsv(title_akas_tsv):
    '''Convert the dataset of localized titles and add the Russian names to the database.'''
    with gzip.open(title_akas_tsv, mode='rt') as file:
        ru_name_writer = []
        counter = 0
        for line in file:
            line = line.split("\t")
            try:
                tt_region = line[3]
                if tt_region != "RU":
                    continue
                tt_id = int(line[0].split("tt")[1])
                tt_type = c.execute("SELECT type FROM titles WHERE tt_id = (?)", (tt_id, )).fetchone()[0]
                if tt_type not in ("movie", "tvMovie", "video"):
                    continue
                ru_name = line[2]
                ru_name_writer.append((ru_name, tt_id))
                counter += 1
                logger.info(f'Processed ru_name: {counter}')
            except Exception:
                logger.exception('Err')
    # Apply all collected Russian names in one batched UPDATE
    c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer)
    conn.commit()
def convert_datasets_to_db():
    print("Converting tsv dataset to sqlite...")
    convert_tsv_to_db("title.basics.tsv.gz")
    print("Unpack ru locale...")
    extract_ru_locale_from_tsv("title.akas.tsv.gz")
def get_title_by_id(films_ids: list):
    tt_list = []
    for i in films_ids:
        tt_film = c.execute("SELECT * FROM titles WHERE tt_id = (?)", (i,)).fetchone()
        if tt_film:
            tt_list.append(tt_film)
    if tt_list:
        return tt_list
def get_title_by_names_and_year(film_names: list):
    tt_list = []
    for i in film_names:
        tt_film = c.execute('''SELECT * FROM titles
                               WHERE (original_name = (?) OR ru_name = (?))
                               AND year = (?)''', i).fetchone()
        if tt_film:
            tt_list.append(tt_film)
    if tt_list:
        return tt_list
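
# Minimal usage sketch, assuming the compressed IMDb dumps title.basics.tsv.gz and
# title.akas.tsv.gz have already been downloaded into the working directory
# (available from https://datasets.imdbws.com/); the ids and names below are only
# hypothetical lookups and return None when they are not present in the local table.
if __name__ == "__main__":
    convert_datasets_to_db()
    # Look up titles by their numeric IMDb ids (tt0111161 -> 111161)
    print(get_title_by_id([111161, 133093]))
    # Each lookup tuple matches the three SQL placeholders: (original_name, ru_name, year)
    print(get_title_by_names_and_year([("The Matrix", "Матрица", 1999)]))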