In [296]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
In [297]:
df_netflix = pd.read_excel("C:/Users/LeeSeungYong/Downloads/netflix_titles.xlsx")
df_netflix_categ = pd.read_excel("C:/Users/LeeSeungYong/Downloads/netflix_titles.xlsx", sheet_name="netflix_titles_category")
In [298]:
df_netflix.head(3)
Out[298]:
 | duration_minutes | duration_seasons | type | title | date_added | release_year | rating | description | show_id
---|---|---|---|---|---|---|---|---|---
0 | 90 | NaN | Movie | Norm of the North: King Sized Adventure | 2019-09-09 00:00:00 | 2019.0 | TV-PG | Before planning an awesome wedding for his gra... | 81145628.0 |
1 | 94 | NaN | Movie | Jandino: Whatever it Takes | 2016-09-09 00:00:00 | 2016.0 | TV-MA | Jandino Asporaat riffs on the challenges of ra... | 80117401.0 |
2 | NaN | 1 | TV Show | Transformers Prime | 2018-09-08 00:00:00 | 2013.0 | TV-Y7-FV | With the help of three human allies, the Autob... | 70234439.0 |
In [299]:
df_netflix.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6236 entries, 0 to 6235
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 duration_minutes 4267 non-null object
1 duration_seasons 1971 non-null object
2 type 6235 non-null object
3 title 6235 non-null object
4 date_added 6223 non-null object
5 release_year 6234 non-null float64
6 rating 6223 non-null object
7 description 6233 non-null object
8 show_id 6232 non-null float64
dtypes: float64(2), object(7)
memory usage: 438.6+ KB
In [300]:
df_netflix_categ.head(3)
Out[300]:
 | listed_in | show_id
---|---|---
0 | Children & Family Movies | 81145628 |
1 | Stand-Up Comedy | 80117401 |
2 | Kids' TV | 70234439 |
In [301]:
df_netflix_categ.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13670 entries, 0 to 13669
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 listed_in 13670 non-null object
1 show_id 13670 non-null int64
dtypes: int64(1), object(1)
memory usage: 213.7+ KB
In [302]:
df_join = df_netflix.join(df_netflix_categ, how = "left", lsuffix = "show_id")  # joins on the row index; lsuffix only renames the overlapping show_id column
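Note: with no `on` key, `DataFrame.join` matches rows by index position, so each title keeps only the category row that happens to share its row number. A key-based merge, shown below purely as a sketch (the name `df_merged` is illustrative, not the code used here), would instead keep every (title, category) pair from the category sheet:

# Sketch of an alternative, key-based join (not the cell above):
# cast show_id to a common dtype, then merge so every (title, category) pair is kept.
df_merged = df_netflix.merge(
    df_netflix_categ.astype({"show_id": "float64"}),
    on="show_id",
    how="left",
)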
In [303]:
df_join = df_join[['listed_in', 'description']]
df_join
Out[303]:
 | listed_in | description
---|---|---
0 | Children & Family Movies | Before planning an awesome wedding for his gra... |
1 | Stand-Up Comedy | Jandino Asporaat riffs on the challenges of ra... |
2 | Kids' TV | With the help of three human allies, the Autob... |
3 | Kids' TV | When a prison ship crash unleashes hundreds of... |
4 | Comedies | When nerdy high schooler Dani finally attracts... |
... | ... | ... |
6231 | Movies | This parody of first-person shooter games, mil... |
6232 | British TV Shows | Marc Maron stars as Marc Maron, who interviews... |
6233 | Classic & Cult TV | Nursery rhymes and original music for children... |
6234 | Comedies | Set during the Russian Revolution, this comic ... |
6235 | International TV Shows | This hit sitcom follows the merry misadventure... |
6236 rows × 2 columns
In [304]:
print("결측치 처리 전:", df_join.shape)
df_join.dropna(axis = 0, inplace = True)
print("결측치 처리 후:", df_join.shape)
df_join.reset_index(drop = True, inplace = True)
결측치 처리 전: (6236, 2)
결측치 처리 후: (6233, 2)
In [305]:
df_join['description'] = df_join['description'].str.lower()
In [306]:
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
In [307]:
df_join['description'] = df_join['description'].str.replace(pat=r'[^\w]', repl= r' ', regex=True) # replace all non-word characters with a space
df_join['description'] = df_join['description'].str.replace(pat=r'[0-9]', repl= r' ', regex=True) # replace all digits with a space
df_join['description'] = df_join['description'].str.replace(pat=r'\s+', repl= r' ', regex=True) # collapse runs of whitespace into a single space
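The three replacements above can also be wrapped in one helper with the same behaviour; the `clean_text` function below is only a sketch (the name is made up), shown for readability:

import re

def clean_text(text):
    # drop non-word characters and digits
    text = re.sub(r'[^\w]|[0-9]', ' ', text)
    # collapse runs of whitespace into a single space
    return re.sub(r'\s+', ' ', text)

# df_join['description'] = df_join['description'].apply(clean_text)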
In [308]:
df_join['description'][0]
Out[308]:
'before planning an awesome wedding for his grandfather a polar bear king must take back a stolen artifact from an evil archaeologist first '
In [309]:
df_netflix['description'][0]
Out[309]:
'Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from an evil archaeologist first.'
In [310]:
import nltk
from nltk.tokenize import word_tokenize
for i in range(df_join.shape[0]):
    text = df_join['description'][i]
    df_join['description'][i] = word_tokenize(text)
    print("Item {} done".format(i))
In [311]:
df_join
Out[311]:
 | listed_in | description
---|---|---
0 | Children & Family Movies | [before, planning, an, awesome, wedding, for, ... |
1 | Stand-Up Comedy | [jandino, asporaat, riffs, on, the, challenges... |
2 | Kids' TV | [with, the, help, of, three, human, allies, th... |
3 | Kids' TV | [when, a, prison, ship, crash, unleashes, hund... |
4 | Comedies | [when, nerdy, high, schooler, dani, finally, a... |
... | ... | ... |
6228 | Movies | [this, parody, of, first, person, shooter, gam... |
6229 | British TV Shows | [marc, maron, stars, as, marc, maron, who, int... |
6230 | Classic & Cult TV | [nursery, rhymes, and, original, music, for, c... |
6231 | Comedies | [set, during, the, russian, revolution, this, ... |
6232 | International TV Shows | [this, hit, sitcom, follows, the, merry, misad... |
6233 rows × 2 columns
In [312]:
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('english') # English stopwords (Korean is not supported)
print(stop_words[:10])
for i in range(df_join.shape[0]):
    result = []
    for word in df_join['description'][i]:
        if word not in stop_words:
            result.append(word)
    df_join['description'][i] = result
    print("Item {} done".format(i))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
Item 0 done
Item 1 done
Item 2 done
Item 3 done
Item 4 done
Item 5 done
Item 6229 done
Item 6230 done
Item 6231 done
Item 6232 done
In [313]:
# !pip install autocorrect
In [314]:
# from autocorrect import Speller  # load the required package
# spell = Speller()
# for i in range(df_join.shape[0]):
#     for word in df_join['description'][i]:
#         spell(word)
#     print("Item {} done".format(i))
In [315]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
for i in range(df_join.shape[0]):
    result = []
    for word in df_join['description'][i]:
        word = lemmatizer.lemmatize(word)
        result.append(word)
    df_join['description'][i] = result
    print("Item {} done".format(i))
Item 0 done
Item 1 done
Item 2 done
Item 3 done
Item 4 done
Item 5 done
Item 6229 done
Item 6230 done
Item 6231 done
Item 6232 done
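The tokenize / stopword-removal / lemmatization loops above can be collapsed into a single `apply` call; the `preprocess` helper below is just an equivalent sketch (the name is made up), not the code that produced the outputs shown:

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # tokenize, drop English stopwords, then lemmatize each remaining token
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text) if w not in stop_words]

# df_join['description'] = df_join['description'].apply(preprocess)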
In [316]:
df_join
Out[316]:
 | listed_in | description
---|---|---
0 | Children & Family Movies | [planning, awesome, wedding, grandfather, pola... |
1 | Stand-Up Comedy | [jandino, asporaat, riff, challenge, raising, ... |
2 | Kids' TV | [help, three, human, ally, autobots, protect, ... |
3 | Kids' TV | [prison, ship, crash, unleashes, hundred, dece... |
4 | Comedies | [nerdy, high, schooler, dani, finally, attract... |
... | ... | ... |
6228 | Movies | [parody, first, person, shooter, game, militar... |
6229 | British TV Shows | [marc, maron, star, marc, maron, interview, fe... |
6230 | Classic & Cult TV | [nursery, rhyme, original, music, child, accom... |
6231 | Comedies | [set, russian, revolution, comic, miniseries, ... |
6232 | International TV Shows | [hit, sitcom, follows, merry, misadventure, si... |
6233 rows × 2 columns
In [317]:
df_word_bycategory = df_join.groupby("listed_in")['description'].sum().reset_index()  # summing lists concatenates them: one combined token list per category
In [319]:
df_word_bycategory["description"]
Out[319]:
0 [struggling, couple, believe, luck, find, stas...
1 [anime, visual, album, mysterious, driver, hea...
2 [recently, stationed, air, self, defense, forc...
3 [drawing, newly, available, info, show, trace,...
4 [planning, awesome, wedding, grandfather, pola...
5 [year, woman, return, sleepy, tourist, occupie...
6 [losing, everything, indolent, sad, sack, impu...
7 [nerdy, high, schooler, dani, finally, attract...
8 [young, journalist, forced, life, crime, save,...
9 [man, unsatisfying, marriage, recall, detail, ...
10 [newspaper, industry, take, hit, denver, post,...
11 [california, fire, season, rage, brave, backco...
12 [new, orleans, politician, find, idealistic, p...
13 [young, doctor, arrives, asylum, apprenticeshi...
14 [blind, pianist, fear, life, suspicious, death...
15 [dystopian, future, insurance, adjuster, tech,...
16 [four, couple, different, lifestyle, go, ebb, ...
17 [help, three, human, ally, autobots, protect, ...
18 [emmy, winning, comedy, writer, louis, c, k, b...
19 [final, night, world, tour, director, jonathan...
20 [course, fishing, season, tough, men, woman, p...
21 [combining, trademark, wit, self, deprecating,...
22 [straitjacketed, life, turned, topsy, turvy, h...
23 [mit, astrophysics, professor, son, unearth, s...
24 [world, collide, life, shatter, altagracia, po...
25 [mixing, old, footage, interview, story, arsen...
26 [jandino, asporaat, riff, challenge, raising, ...
27 [comedian, norm, macdonald, host, celebrity, f...
28 [wiped, clean, memory, thrown, together, group...
29 [education, fun, funny, comedian, adam, conove...
30 [trapped, mysterious, fog, resident, maine, vi...
31 [anthology, series, terror, feature, diverse, ...
32 [manipulated, embezzling, fund, boyfriend, sen...
33 [explore, hit, song, smith, film, bright, musi...
34 [tailspin, marriage, collapse, jamie, fall, he...
Name: description, dtype: object
In [320]:
from collections import Counter
for i in range(df_word_bycategory.shape[0]):
    count_dict = dict(Counter(df_word_bycategory["description"][i]).most_common()[:100])
    df_word_bycategory["description"][i] = count_dict
    print("Item {} done".format(i))
Item 0 done
Item 1 done
Item 2 done
Item 3 done
Item 4 done
Item 5 done
Item 6 done
Item 7 done
Item 8 done
Item 9 done
Item 10 done
Item 11 done
Item 12 done
Item 13 done
Item 14 done
Item 15 done
Item 16 done
Item 17 done
Item 18 done
Item 19 done
Item 20 done
Item 21 done
Item 22 done
Item 23 done
Item 24 done
Item 25 done
Item 26 done
Item 27 done
Item 28 done
Item 29 done
Item 30 done
Item 31 done
Item 32 done
Item 33 done
Item 34 done
In [323]:
print(df_word_bycategory['description'][0])
{'life': 75, 'find': 61, 'take': 55, 'young': 55, 'man': 45, 'world': 40, 'woman': 40, 'must': 38, 'two': 35, 'family': 35, 'one': 33, 'friend': 32, 'new': 30, 'love': 29, 'father': 27, 'year': 27, 'brother': 25, 'help': 23, 'team': 23, 'save': 22, 'cop': 22, 'city': 21, 'crime': 21, 'death': 21, 'agent': 20, 'mission': 20, 'girl': 20, 'battle': 20, 'group': 20, 'three': 20, 'get': 20, 'make': 19, 'old': 19, 'fight': 19, 'try': 19, 'murder': 18, 'drug': 18, 'come': 18, 'way': 18, 'star': 18, 'war': 17, 'home': 17, 'high': 17, 'go': 17, 'wife': 17, 'back': 17, 'son': 17, 'turn': 17, 'lead': 17, 'terrorist': 16, 'run': 16, 'return': 16, 'power': 16, 'school': 16, 'true': 16, 'police': 16, 'four': 16, 'force': 16, 'country': 15, 'special': 15, 'time': 15, 'officer': 15, 'art': 14, 'deadly': 14, 'criminal': 14, 'revenge': 14, 'evil': 14, 'first': 14, 'mysterious': 14, 'martial': 13, 'boy': 13, 'daughter': 13, 'face': 13, 'ex': 13, 'secret': 13, 'become': 13, 'film': 13, 'join': 13, 'work': 13, 'american': 13, 'day': 13, 'protect': 12, 'forced': 12, 'village': 12, 'prison': 12, 'struggle': 12, 'dangerous': 12, 'story': 12, 'becomes': 12, 'gang': 12, 'action': 12, 'set': 12, 'bos': 12, 'behind': 12, 'teen': 11, 'soon': 11, 'u': 11, 'crew': 11, 'based': 11, 'student': 11}
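`Counter(...).most_common()[:100]` keeps each category's 100 most frequent tokens from the concatenated list built in In [317]. A pandas-only sketch of the same counting (column names are illustrative; ties may be broken in a different order than by `Counter`) uses `explode`:

df_counts = (
    df_join.explode('description')                       # one row per (category, token)
           .groupby(['listed_in', 'description'])
           .size()
           .reset_index(name='frequency')
           .sort_values(['listed_in', 'frequency'], ascending=[True, False])
           .groupby('listed_in')
           .head(100)                                     # top 100 tokens per category
)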
In [372]:
df_final = pd.DataFrame()
for i, c in enumerate(df_word_bycategory['listed_in']):
    print(i, c)
    df_word = pd.Series(df_word_bycategory['description'][i]).reset_index()
    df_word["category"] = c
    df_final = df_final.append(df_word, ignore_index = True)
    print("Item {} done".format(i))
0 Action & Adventure
Item 0 done
1 Anime Features
Item 1 done
2 Anime Series
Item 2 done
3 British TV Shows
Item 3 done
4 Children & Family Movies
Item 4 done
5 Classic & Cult TV
Item 5 done
6 Classic Movies
Item 6 done
7 Comedies
Item 7 done
8 Crime TV Shows
Item 8 done
9 Cult Movies
Item 9 done
10 Documentaries
Item 10 done
11 Docuseries
Item 11 done
12 Dramas
Item 12 done
13 Horror Movies
Item 13 done
14 Independent Movies
Item 14 done
15 International Movies
Item 15 done
16 International TV Shows
Item 16 done
17 Kids' TV
Item 17 done
18 Movies
Item 18 done
19 Music & Musicals
Item 19 done
20 Reality TV
Item 20 done
21 Romantic Movies
Item 21 done
22 Romantic TV Shows
Item 22 done
23 Sci-Fi & Fantasy
Item 23 done
24 Spanish-Language TV Shows
Item 24 done
25 Sports Movies
Item 25 done
26 Stand-Up Comedy
Item 26 done
27 Stand-Up Comedy & Talk Shows
Item 27 done
28 TV Action & Adventure
Item 28 done
29 TV Comedies
Item 29 done
30 TV Dramas
Item 30 done
31 TV Horror
Item 31 done
32 TV Sci-Fi & Fantasy
Item 32 done
33 TV Shows
Item 33 done
34 Thrillers
Item 34 done
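Note that `DataFrame.append`, used above, was removed in pandas 2.0, so the loop fails on newer versions. A `pd.concat`-based sketch that builds the same long table (assuming the same `df_word_bycategory`) would be:

# Collect the per-category frames first, then concatenate once.
frames = []
for i, c in enumerate(df_word_bycategory['listed_in']):
    df_word = pd.Series(df_word_bycategory['description'][i]).reset_index()  # columns: 'index', 0
    df_word['category'] = c
    frames.append(df_word)
df_final = pd.concat(frames, ignore_index = True)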
In [374]:
df_final = df_final[["category", "index", 0]]
df_final
Out[374]:
 | category | index | 0
---|---|---|---
0 | Action & Adventure | life | 75 |
1 | Action & Adventure | find | 61 |
2 | Action & Adventure | take | 55 |
3 | Action & Adventure | young | 55 |
4 | Action & Adventure | man | 45 |
... | ... | ... | ... |
3169 | Thrillers | spy | 1 |
3170 | Thrillers | million | 1 |
3171 | Thrillers | haul | 1 |
3172 | Thrillers | selling | 1 |
3173 | Thrillers | held | 1 |
3174 rows × 3 columns
In [376]:
df_final.columns = ["category", "word", "frequency"]
df_final.to_excel("word_count.xlsx", index = False)