Transformação nas tabelas movies e genome

%load_ext pretty_jupyter

Importar Bibliotecas¶

import pandas as pd
import os
import numpy as np

import psutil # CHECAR MEMORIA DO PC

# Usada na função para descobrir se é possível converter object em category
from pandas.api.types import CategoricalDtype

import re

import matplotlib as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

Definir Diretório¶

# Change the process working directory to the project folder
# (hard-coded absolute path).
os.chdir("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2")

Carregar Arquivos¶

# Load the movies tables (treino = train split, teste = test split) from
# gzip-compressed pickle files in the 2.Datasets_Limpeza folder.
movies_treino = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/2.Datasets_Limpeza/movies_treino.pickle", compression="gzip") 
movies_teste = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/2.Datasets_Limpeza/movies_teste.pickle", compression="gzip")

movies_treino

	movieId	title	genres
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
4	5	Father of the Bride Part II (1995)	Comedy
...	...	...	...
86397	288557	Initial D: Third Stage (2001)	Action\|Animation\|Romance
86427	288647	Everybody's Oma (2022)	Documentary
86435	288669	Insidious: The Red Door (2023)	Horror\|Mystery\|Thriller
86439	288679	The Out-Laws (2023)	Action\|Comedy\|Romance
86470	288761	Novalis - Die blaue Blume (1993)	Drama

24345 rows × 3 columns

# Load the genome_tags tables (treino = train split, teste = test split) from
# gzip-compressed pickle files in the 2.Datasets_Limpeza folder.
genome_tags_treino= pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/2.Datasets_Limpeza/genome_tags_treino.pickle", compression="gzip")
genome_tags_teste = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/2.Datasets_Limpeza/genome_tags_teste.pickle", compression="gzip")

genome_tags_treino

	tagId	tag
0	1	007
1	2	007 (series)
2	3	18th century
3	4	1920s
4	5	1930s
...	...	...
1123	1124	writing
1124	1125	wuxia
1125	1126	wwii
1126	1127	zombie
1127	1128	zombies

961 rows × 2 columns

# Load the genome_scores tables (treino = train split, teste = test split)
# from gzip-compressed pickle files in the 2.Datasets_Limpeza folder.
genome_scores_treino = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/2.Datasets_Limpeza/genome_scores_treino.pickle", compression="gzip")
genome_scores_teste = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/2.Datasets_Limpeza/genome_scores_teste.pickle", compression="gzip")

genome_scores_treino

	movieId	tagId	relevance
0	1	1	0.03200
1	1	2	0.02225
2	1	3	0.07000
3	1	4	0.05900
4	1	5	0.12300
...	...	...	...
18472123	288167	1124	0.09875
18472124	288167	1125	0.02950
18472125	288167	1126	0.02275
18472126	288167	1127	0.11225
18472127	288167	1128	0.03025

15737336 rows × 3 columns

Classes¶

Classe 1: Extrair o ano do título¶

class ExtrairAno(BaseEstimator, TransformerMixin):
    """Extract the release year from a movie title into an integer column.

    The year is taken from the first group of exactly four digits wrapped in
    parentheses, e.g. ``"Toy Story (1995)"`` -> ``1995``. It is stored in a
    new ``Ano_do_filme`` column; titles without such a group get ``-1``.

    Args:
        coluna: name of the column that holds the title with the year.
    """

    def __init__(self, coluna):
        self.coluna = coluna

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``Ano_do_filme`` column added."""
        tabela = X.copy()
        nova_coluna_nome = "Ano_do_filme"
        # Capture the four digits found between parentheses (NaN when absent).
        anos = tabela[self.coluna].str.extract(r'\((\d{4})\)', expand=False)
        # Convert to numbers first so the -1 sentinel is never written into a
        # string-typed column; failed extractions become NaN, then -1.
        tabela[nova_coluna_nome] = (
            pd.to_numeric(anos, errors="coerce").fillna(-1).astype(int)
        )
        return tabela

Classe 2: Criar coluna de título sem o ano¶

class ExtrairTitulo(BaseEstimator, TransformerMixin):
    """Create a ``titulo_sem_ano`` column with the title stripped of its year.

    The new value is the text that precedes the first parenthesised
    four-digit group, without the whitespace right before it, e.g.
    ``"Toy Story (1995)"`` -> ``"Toy Story"``. Titles without such a group
    get the integer sentinel ``-1``.

    Args:
        coluna: name of the column that holds the title with the year.
    """

    def __init__(self, coluna):
        self.coluna = coluna

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``titulo_sem_ano`` column added."""
        tabela = X.copy()
        # expand=False yields a Series (one capture group), so a single
        # column is assigned rather than a one-column DataFrame.
        titulos = tabela[self.coluna].str.extract(
            r'^(.*?)\s*\(\d{4}\)', expand=False
        )
        # The column mixes strings with the -1 sentinel, so cast to object
        # before filling the rows where the extraction failed.
        tabela['titulo_sem_ano'] = titulos.astype(object).fillna(-1)
        return tabela

Classe 3: Criar coluna com só um gênero por filme¶

class GeneroSeparado(BaseEstimator, TransformerMixin):
    """Create a column holding only the first genre of each movie.

    Genres are expected as a single ``|``-separated string; the text before
    the first ``|`` is written to a new ``<coluna>_separado`` column. Missing
    values are propagated as missing instead of raising.

    Args:
        coluna: name of the column that holds the movie genres.
    """

    def __init__(self, coluna):
        self.coluna = coluna

    def fit(self, X, y=None):
        """Learn nothing (the transformer is stateless) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``<coluna>_separado`` column added."""
        tabela = X.copy()
        # Vectorised split: only the first piece is needed, so stop after one
        # split (n=1). Unlike a per-row str.split call, NaN stays NaN.
        tabela[f"{self.coluna}_separado"] = (
            tabela[self.coluna].str.split('|', n=1).str[0]
        )
        return tabela

Transformação nas tabelas¶

Pipeline da tabela movies¶

# Build the movies pipeline; the steps run in the order listed.
pipeline_movies = Pipeline(
    steps=[
        ('Extrair o ano do titulo', ExtrairAno(coluna='title')),
        ('Criar coluna de título sem o ano', ExtrairTitulo(coluna='title')),
        ('Criar coluna com só um gênero por filme', GeneroSeparado(coluna='genres')),
    ]
)

pipeline_movies

Pipeline(steps=[('Extrair o ano do titulo', ExtrairAno(coluna='title')),
                ('Criar coluna de título sem o ano',
                 ExtrairTitulo(coluna='title')),
                ('Criar coluna com só um gênero por filme',
                 GeneroSeparado(coluna='genres'))])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# Run the pipeline on both tables.
# NOTE: fit_transform is also called on the test table. Every step's fit()
# in this notebook only returns self, so nothing is learned from either
# table and the result equals a plain transform.
movies_treino_transformado = pipeline_movies.fit_transform(movies_treino)
movies_teste_transformado = pipeline_movies.fit_transform(movies_teste)

movies_treino_transformado

	movieId	title	genres	Ano_do_filme	titulo_sem_ano	genres_separado
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	1995	Toy Story	Adventure
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy	1995	Jumanji	Adventure
2	3	Grumpier Old Men (1995)	Comedy\|Romance	1995	Grumpier Old Men	Comedy
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance	1995	Waiting to Exhale	Comedy
4	5	Father of the Bride Part II (1995)	Comedy	1995	Father of the Bride Part II	Comedy
...	...	...	...	...	...	...
86397	288557	Initial D: Third Stage (2001)	Action\|Animation\|Romance	2001	Initial D: Third Stage	Action
86427	288647	Everybody's Oma (2022)	Documentary	2022	Everybody's Oma	Documentary
86435	288669	Insidious: The Red Door (2023)	Horror\|Mystery\|Thriller	2023	Insidious: The Red Door	Horror
86439	288679	The Out-Laws (2023)	Action\|Comedy\|Romance	2023	The Out-Laws	Action
86470	288761	Novalis - Die blaue Blume (1993)	Drama	1993	Novalis - Die blaue Blume	Drama

24345 rows × 6 columns

# Count missing values (NA) per column of the transformed training table.
movies_treino_transformado.isna().sum()

movieId            0
title              0
genres             0
Ano_do_filme       0
titulo_sem_ano     0
genres_separado    0
dtype: int64

Salvar tabelas movies¶

# Save the transformed movies tables (train and test) as gzip-compressed pickles.
movies_treino_transformado.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/3.Datasets_Transformação/3.2_Datasets_Transformação_parte_2/movies_treino_transformado.pickle", compression = 'gzip')
movies_teste_transformado.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/3.Datasets_Transformação/3.2_Datasets_Transformação_parte_2/movies_teste_transformado.pickle", compression = 'gzip')

Unir as tabelas genome_tags e genome_scores¶

# Inner-join the training relevance scores with the training tags on tagId.
genome_treino = pd.merge(
    left=genome_scores_treino,
    right=genome_tags_treino,
    how="inner",
    on="tagId",
)

genome_treino

	movieId	tagId	relevance	tag
0	1	1	0.03200	007
1	1	2	0.02225	007 (series)
2	1	3	0.07000	18th century
3	1	4	0.05900	1920s
4	1	5	0.12300	1930s
...	...	...	...	...
15737331	288167	1124	0.09875	writing
15737332	288167	1125	0.02950	wuxia
15737333	288167	1126	0.02275	wwii
15737334	288167	1127	0.11225	zombie
15737335	288167	1128	0.03025	zombies

15737336 rows × 4 columns

# Inner-join the test relevance scores with the test tags on tagId.
genome_teste = pd.merge(
    left=genome_scores_teste,
    right=genome_tags_teste,
    how="inner",
    on="tagId",
)

genome_teste

	movieId	tagId	relevance	tag
0	1	1	0.03200	007
1	1	6	0.13100	1950s
2	1	7	0.06175	1960s
3	1	8	0.19550	1970s
4	1	9	0.26625	1980s
...	...	...	...	...
10840907	288167	1116	0.37600	women
10840908	288167	1117	0.01825	working class
10840909	288167	1118	0.08100	workplace
10840910	288167	1119	0.08075	world politics
10840911	288167	1128	0.03025	zombies

10840912 rows × 4 columns

Salvar tabelas genome¶

# Save the merged genome tables (train and test) as gzip-compressed pickles.
genome_treino.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/3.Datasets_Transformação/3.2_Datasets_Transformação_parte_2/genome_treino.pickle", compression='gzip')
genome_teste.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendação_MovieLens_2/Datasets/3.Datasets_Transformação/3.2_Datasets_Transformação_parte_2/genome_teste.pickle", compression='gzip')

⚠ Arquivos para as próximas etapas¶

movies_treino_transformado

movies_teste_transformado

genome_treino

genome_teste

Ver README do projeto

Sistema de Recomendação MovieLens

Transformação dos dados: Parte 2

Catarina Aguiar