Ícone Ver README do projeto

Transformação nas tabelas para modelagem

%load_ext pretty_jupyter

Importar Bibliotecas

import pandas as pd
import os
import numpy as np
import matplotlib as plt
import seaborn as sns

Definir Diretório

# Switch the working directory to the project folder (hard-coded absolute path).
os.chdir("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2")

Carregar Arquivos

# Load the train and test ratings tables from the part-1 transformation folder
# (gzip-compressed pickles).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
ratings_treino_transformado_modelagem = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.1_Datasets_Transformação_parte_1/ratings_treino_transformado_modelagem.pickle", compression='gzip')
ratings_teste_transformado_modelagem = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.1_Datasets_Transformação_parte_1/ratings_teste_transformado_modelagem.pickle", compression='gzip')
# Preview the first rows of the test ratings.
ratings_teste_transformado_modelagem.head() 
userId Numero_de_Avaliacoes_por_usuarios movieId Numero_de_Avaliacoes_por_Filme rating timestamp rating_times rating_medio_simples rating_medio_ponderado
0 128 19 168 81 3.0 1998-07-28 13:16:18 0.026268 3.166667 0.851209
1 2311 148 168 81 4.0 2011-07-22 12:26:00 0.375046 3.166667 0.851209
2 2647 128 168 81 5.0 1997-03-31 15:23:45 0.034370 3.166667 0.851209
3 9051 18 168 81 3.0 2005-03-24 17:36:55 0.088577 3.166667 0.851209
4 9595 45 168 81 3.0 1996-05-19 18:24:49 0.017608 3.166667 0.851209
# Load the train and test movie tables from the part-2 transformation folder
# (gzip-compressed pickles).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
movies_treino_transformado = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.2_Datasets_Transformação_parte_2/movies_treino_transformado.pickle", compression='gzip')
movies_teste_transformado = pd.read_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.2_Datasets_Transformação_parte_2/movies_teste_transformado.pickle", compression='gzip')
# Preview the first rows of the training movies.
movies_treino_transformado.head()
movieId title genres Ano_do_filme titulo_sem_ano genres_separado
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure
1 2 Jumanji (1995) Adventure|Children|Fantasy 1995 Jumanji Adventure
2 3 Grumpier Old Men (1995) Comedy|Romance 1995 Grumpier Old Men Comedy
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance 1995 Waiting to Exhale Comedy
4 5 Father of the Bride Part II (1995) Comedy 1995 Father of the Bride Part II Comedy

Tabelas de apoio

# Catalogue of training movies: title plus its single representative genre.
# NOTE(review): the variable is spelled "catalago", while the file it is later
# saved to is named "catalogo.pickle" -- keep the spelling consistent with every
# other cell that references this name before renaming it.
catalago = movies_treino_transformado[['title', 'genres_separado']]
catalago
title genres_separado
0 Toy Story (1995) Adventure
1 Jumanji (1995) Adventure
2 Grumpier Old Men (1995) Comedy
3 Waiting to Exhale (1995) Comedy
4 Father of the Bride Part II (1995) Comedy
... ... ...
86397 Initial D: Third Stage (2001) Action
86427 Everybody's Oma (2022) Documentary
86435 Insidious: The Red Door (2023) Horror
86439 The Out-Laws (2023) Action
86470 Novalis - Die blaue Blume (1993) Drama

24345 rows × 2 columns

# Join the training ratings onto the training movies by movieId.
# Movies are the left table of a left join, so every movie row is kept even
# when it has no matching rating.
genero_user_treino1 = movies_treino_transformado.merge(
    ratings_treino_transformado_modelagem,
    how='left',
    on='movieId',
)
genero_user_treino1
movieId title genres Ano_do_filme titulo_sem_ano genres_separado userId Numero_de_Avaliacoes_por_usuarios Numero_de_Avaliacoes_por_Filme rating timestamp rating_times rating_medio_simples rating_medio_ponderado
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 144 154 1848 3.5 2014-01-12 03:50:37 0.515954 3.928842 3.264437
1 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 304 498 1848 3.0 2023-06-01 19:24:35 2.453737 3.928842 3.264437
2 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 461 1692 1848 4.5 2020-08-28 20:04:03 2.225713 3.928842 3.264437
3 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 751 469 1848 3.0 2012-09-30 20:05:17 0.349977 3.928842 3.264437
4 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 974 1160 1848 5.0 1999-09-17 11:01:24 0.053903 3.928842 3.264437
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
820503 288557 Initial D: Third Stage (2001) Action|Animation|Romance 2001 Initial D: Third Stage Action 256559 803 2 4.0 2023-06-30 15:36:23 3.319434 4.000000 0.140522
820504 288647 Everybody's Oma (2022) Documentary 2022 Everybody's Oma Documentary 218862 886 1 4.5 2023-07-05 08:02:09 3.743711 4.500000 0.126201
820505 288669 Insidious: The Red Door (2023) Horror|Mystery|Thriller 2023 Insidious: The Red Door Horror 324508 1126 1 2.5 2023-07-15 02:49:04 2.090265 2.500000 0.070112
820506 288679 The Out-Laws (2023) Action|Comedy|Romance 2023 The Out-Laws Action 65065 2569 1 2.5 2023-07-09 01:00:03 2.084003 2.500000 0.070112
820507 288761 Novalis - Die blaue Blume (1993) Drama 1993 Novalis - Die blaue Blume Drama 222466 571 1 4.0 2023-07-11 09:29:45 3.337741 4.000000 0.112179

820508 rows × 14 columns

# Join the test ratings onto the test movies by movieId.
# Movies are the left table of a left join, so every movie row is kept even
# when it has no matching rating.
genero_user_teste1 = movies_teste_transformado.merge(
    ratings_teste_transformado_modelagem,
    how='left',
    on='movieId',
)
genero_user_teste1.head()
movieId title genres Ano_do_filme titulo_sem_ano genres_separado userId Numero_de_Avaliacoes_por_usuarios Numero_de_Avaliacoes_por_Filme rating timestamp rating_times rating_medio_simples rating_medio_ponderado
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 465 98 494 5.0 2017-02-16 04:44:41 1.297498 3.799595 2.333916
1 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 598 124 494 4.0 1997-09-22 16:26:23 0.030011 3.799595 2.333916
2 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 1931 264 494 5.0 2020-04-12 18:15:52 2.306977 3.799595 2.333916
3 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 2311 148 494 4.0 2011-07-23 16:52:35 0.375234 3.799595 2.333916
4 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995 Toy Story Adventure 2647 128 494 3.0 1997-03-31 15:11:04 0.020622 3.799595 2.333916
# Save the support tables (catalogue and the movie/rating joins) as
# gzip-compressed pickles in the part-3 transformation folder.
catalago.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.3_Datasets_Transformação_parte_3/catalogo.pickle", compression='gzip')
genero_user_treino1.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.3_Datasets_Transformação_parte_3/genero_user_treino1.pickle", compression='gzip')
genero_user_teste1.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.3_Datasets_Transformação_parte_3/genero_user_teste1.pickle", compression='gzip')

Tabela para modelagem: Algoritmo NearestNeighbors

Dados de Treino - (KNN)

# Keep only the columns needed for the KNN training table.
movie_knn_treino = movies_treino_transformado.loc[:, ['movieId', 'title', 'genres_separado']]
ratings_knn_treino = ratings_treino_transformado_modelagem.loc[:, ['userId', 'movieId', 'rating_times']]
# Left join from the ratings side: one output row per rating, enriched with
# the movie title and genre.
movie_ratings_knn_treino = ratings_knn_treino.merge(
    movie_knn_treino,
    on="movieId",
    how="left",
)
movie_ratings_knn_treino
userId movieId rating_times title genres_separado
0 5 47 0.091853 Seven (a.k.a. Se7en) (1995) Mystery
1 5 175 0.073483 Kids (1995) Drama
2 5 257 0.073483 Just Cause (1995) Mystery
3 5 318 0.073483 Shawshank Redemption, The (1994) Crime
4 5 319 0.073483 Shallow Grave (1994) Comedy
... ... ... ... ... ...
820503 330963 53953 0.029359 1408 (2007) Drama
820504 330963 54190 0.176156 Across the Universe (2007) Drama
820505 330963 55069 0.293593 4 Months, 3 Weeks and 2 Days (4 luni, 3 saptam... Drama
820506 330963 55282 0.293593 30 Days of Night (2007) Horror
820507 330963 58293 0.029359 10,000 BC (2008) Adventure

820508 rows × 5 columns

Tabela : Filmes

# User x title matrix of rating_times for training; missing cells become 0.
# pivot_table uses its default aggregation (mean), so rows that share the same
# userId and title are averaged into a single cell.
knn_filmes_treino = pd.pivot_table(
    movie_ratings_knn_treino,
    values="rating_times",
    index='userId',
    columns='title',
).fillna(0)
knn_filmes_treino
title (2019) "Great Performances" Cats (1998) #Alive (2020) #Female Pleasure (2018) #Iamhere (2020) #UNFIT: The Psychology of Donald Trump (2019) $ (Dollars) (1971) $5 a Day (2008) $9.99 (2008) $ellebrity (Sellebrity) (2012) ... Üvegtigris (2001) Τέλειοι Ξένοι (2016) Χούλιγκανς: Κάτω τα χέρια απ' τα νιάτα! (1983) Делай - раз! (1989) Каменная башка (2008) Карусель (1970) Он вам не Димон (2017) Пес Барбос и необычный кросс (1961) Я худею (2018) …And the Fifth Horseman Is Fear (1965)
userId
5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
15 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
49 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
119 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
134 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
330651 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330661 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330811 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330949 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330963 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

7943 rows × 24326 columns

Dados de Teste - KNN

# Keep only the columns needed for the KNN test table.
movie_knn_teste = movies_teste_transformado.loc[:, ['movieId', 'title', 'genres_separado']]
ratings_knn_teste = ratings_teste_transformado_modelagem.loc[:, ['userId', 'movieId', 'rating_times']]
# Left join from the ratings side: one output row per rating, enriched with
# the movie title and genre.
movie_ratings_knn_teste = ratings_knn_teste.merge(
    movie_knn_teste,
    on="movieId",
    how="left",
)
movie_ratings_knn_teste
userId movieId rating_times title genres_separado
0 128 168 0.026308 First Knight (1995) Action
1 128 208 0.008769 Waterworld (1995) Action
2 128 356 0.035077 Forrest Gump (1994) Comedy
3 128 480 0.017539 Jurassic Park (1993) Action
4 128 590 0.017539 Dances with Wolves (1990) Adventure
... ... ... ... ... ...
207989 330948 115210 0.212142 Fury (2014) Action
207990 330948 129779 0.318213 Ghost in the Shell Arise - Border 1: Ghost Pai... Action
207991 330948 130634 0.106071 Furious 7 (2015) Action
207992 330948 132584 0.106071 The Even Stevens Movie (2003) Children
207993 330948 136459 0.318213 Jeff Dunham: Spark of Insanity (2007) Comedy

207994 rows × 5 columns

Tabela: Filmes

# User x title matrix of rating_times for testing; missing cells become 0.
# pivot_table uses its default aggregation (mean), so rows that share the same
# userId and title are averaged into a single cell.
knn_filmes_teste = pd.pivot_table(
    movie_ratings_knn_teste,
    values="rating_times",
    index='userId',
    columns='title',
).fillna(0)
knn_filmes_teste
title #Alive (2020) $ (Dollars) (1971) '71 (2014) '83 (2021) 'Hellboy': The Seeds of Creation (2004) 'Round Midnight (1986) 'Salem's Lot (2004) 'Til There Was You (1997) 'burbs, The (1989) 'night Mother (1986) ... tick, tick...BOOM! (2021) xXx (2002) xXx: Return of Xander Cage (2017) xXx: State of the Union (2005) ¡Three Amigos! (1986) ¿Quién mató a Bambi? (2013) À nous la liberté (Freedom for Us) (1931) Ánimas (2018) Épouse-moi mon pote (2017) Ужас, который всегда с тобой (2007)
userId
128 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
172 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
465 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
598 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
919 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
330236 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330321 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330496 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330667 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
330948 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1986 rows × 15496 columns

Salvar tabelas para KNN

# Save the KNN user x title matrices (train and test) as gzip-compressed
# pickles in the part-3 transformation folder.
knn_filmes_treino.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.3_Datasets_Transformação_parte_3/knn_filmes_treino.pickle", compression='gzip')
knn_filmes_teste.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.3_Datasets_Transformação_parte_3/knn_filmes_teste.pickle", compression='gzip')

Tabela para modelagem: Algoritmo FP-Growth

Recomendação por gênero

Função que cria os generos dummies

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

def genero_dummy(df: pd.DataFrame) -> pd.DataFrame:
    """Build one row of 0/1 genre indicators per (userId, movieId) pair.

    Args:
        df: Frame with at least the columns ``userId``, ``movieId`` and
            ``genres``, where ``genres`` holds genre names joined by ``"|"``.

    Returns:
        A new frame with ``userId`` followed by one integer (0/1) column per
        distinct genre found in ``df``. There is one row per distinct
        (userId, movieId) pair, ordered by those keys; ``movieId`` itself is
        not included. The input frame is not modified.
    """
    # 1. One row per (rating, genre): split the "|"-joined string and explode it.
    exploded = df.assign(genres=df['genres'].str.split('|')).explode('genres')

    # 2. One indicator column per genre, aligned row-by-row with `exploded`.
    indicators = pd.get_dummies(exploded['genres'])
    keyed = pd.concat([exploded[['userId', 'movieId']], indicators], axis=1)

    # 3. Collapse the exploded rows back to one row per (userId, movieId);
    #    max() keeps a genre flagged if any of the exploded rows had it.
    per_pair = keyed.groupby(['userId', 'movieId']).max()

    # 4. Force plain 0/1 integers in a single vectorised step. The keys are
    #    still in the index here, so only the genre columns are converted
    #    (this replaces a per-element Python lambda applied column by column).
    flags = per_pair.gt(0).astype('int64')

    # 5. Bring userId back as a column and drop movieId from the result.
    return flags.reset_index().drop(columns=['movieId'])


Dados de treino

# Training data: pair every training rating (userId, movieId) with the movie's
# "|"-joined genres. A right join keeps one row per rating.
recomendacao_genero_treino = pd.merge(
    movies_treino_transformado[['movieId', 'genres']],
    ratings_treino_transformado_modelagem[['userId', 'movieId']],
    how='right',
    on='movieId',
)
recomendacao_genero_treino
# Expand the genres into 0/1 indicator columns per (userId, movieId).
recomendacao_genero_treino1 = genero_dummy(recomendacao_genero_treino)
recomendacao_genero_treino1
userId (no genres listed) Action Adventure Animation Children Comedy Crime Documentary Drama ... Film-Noir Horror IMAX Musical Mystery Romance Sci-Fi Thriller War Western
0 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 1 0 0
1 5 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
2 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 1 0 0
3 5 0 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
4 5 0 0 0 0 0 1 0 0 1 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
820503 330963 0 0 0 0 0 0 0 0 1 ... 0 1 0 0 0 0 0 1 0 0
820504 330963 0 0 0 0 0 0 0 0 1 ... 0 0 0 1 0 1 0 0 0 0
820505 330963 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
820506 330963 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 1 0 0
820507 330963 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0

820508 rows × 21 columns


Dados de teste

# Test data: pair every test rating (userId, movieId) with the movie's
# "|"-joined genres. A right join keeps one row per rating.
recomendacao_genero_teste = pd.merge(
    movies_teste_transformado[['movieId', 'genres']],
    ratings_teste_transformado_modelagem[['userId', 'movieId']],
    how='right',
    on='movieId',
)
recomendacao_genero_teste
movieId genres userId
0 168 Action|Drama|Romance 128
1 168 Action|Drama|Romance 2311
2 168 Action|Drama|Romance 2647
3 168 Action|Drama|Romance 9051
4 168 Action|Drama|Romance 9595
... ... ... ...
207989 162566 (no genres listed) 330667
207990 6572 Comedy|Crime 330948
207991 106542 Action|Comedy|Romance 330948
207992 132584 Children|Comedy 330948
207993 136459 Comedy 330948

207994 rows × 3 columns

# Expand the test genres into 0/1 indicator columns per (userId, movieId).
recomendacao_genero_teste1 = genero_dummy(recomendacao_genero_teste)
recomendacao_genero_teste1
userId (no genres listed) Action Adventure Animation Children Comedy Crime Documentary Drama ... Film-Noir Horror IMAX Musical Mystery Romance Sci-Fi Thriller War Western
0 128 0 1 0 0 0 0 0 0 1 ... 0 0 0 0 0 1 0 0 0 0
1 128 0 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
2 128 0 0 0 0 0 1 0 0 1 ... 0 0 0 0 0 1 0 0 1 0
3 128 0 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 0
4 128 0 0 1 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
207989 330948 0 1 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 1 0
207990 330948 0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
207991 330948 0 1 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
207992 330948 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
207993 330948 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

207994 rows × 21 columns

Salvar tabelas para FP-Growth

# Save the genre-indicator tables (train and test) used for FP-Growth as
# gzip-compressed pickles in the part-3 transformation folder.
recomendacao_genero_treino1.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.3_Datasets_Transformação_parte_3/recomendacao_genero_treino1.pickle", compression='gzip')
recomendacao_genero_teste1.to_pickle("C:/0.Projetos/5.Sistema_de_Recomendacao_MovieLens_2/Datasets/3.Datasets_Transformação/3.3_Datasets_Transformação_parte_3/recomendacao_genero_teste1.pickle", compression='gzip')

⚠ Arquivos para a modelagem

  • knn_filmes_treino
  • knn_filmes_teste
  • recomendacao_genero_treino1
  • recomendacao_genero_teste1
  • catalogo



Ícone Ver README do projeto