Main Quest04 실습-3일차2nd

데이터 분석가:Applied Data Analytics

Main Quest04 실습-3일차2nd

데이터분석 2025. 3. 3. 22:50

320x100

# Load the raw project dataset and run basic sanity checks.
# pandas is imported here because nothing earlier in this file imports it.
import pandas as pd

output_df = pd.read_csv("/content/drive/MyDrive/아이펠_오마카세_프로젝트/output.csv")

# Optional: print every column instead of eliding wide frames.
pd.set_option('display.max_columns', None)

# Preview the first five rows.
print(output_df.head())

# Missing values per column (the author's note: three columns were affected).
print("결측치 현황:")
print(output_df.isnull().sum())

# Number of fully duplicated rows (the author's note: none were found).
print("중복 행 수:", output_df.duplicated().sum())

brand_name 컬럼에서 결측치가 가장 많고, sub3_category와 sub4_category도 상당한 수의 결측치가 존재

# Collapse brand_name into a two-level flag: missing -> "unknown", present -> "known".
# isna() + map works on the whole column at once instead of calling a Python
# lambda for every row.
output_df['brand_name'] = output_df['brand_name'].isna().map({True: 'unknown', False: 'known'})

print(output_df['brand_name'].value_counts())

brand_name
known 849325
unknown 632336
Name: count, dtype: int64

# matplotlib and seaborn are imported here because the file's first import of
# them appears only after this plot.
import matplotlib.pyplot as plt
import seaborn as sns

# Use the default sans-serif family (all labels in this chart are English).
plt.rcParams['font.family'] = 'sans-serif'
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Row count per brand_name value.
brand_counts = output_df['brand_name'].value_counts()

# Bar chart of the counts.
plt.figure(figsize=(8, 6))
sns.barplot(x=brand_counts.index, y=brand_counts.values, palette='viridis')
plt.title("Brand Count", fontsize=18)
plt.xlabel("Brand", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.tight_layout()
plt.show()

# Load the semifinal4 dataset and repeat the basic quality checks on it.
output_semi4_df = pd.read_csv(
    '/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal4.csv'
)
pd.set_option('display.max_columns', None)

# Bare expression: renders the frame when this is the last line of a notebook cell.
output_semi4_df

# Missing values per column (the author's note: three columns were affected).
missing_per_column = output_semi4_df.isnull().sum()
print("결측치 현황:")
print(missing_per_column)

# Number of fully duplicated rows (the author's note: none were found).
duplicate_row_count = output_semi4_df.duplicated().sum()
print("중복 행 수:", duplicate_row_count)

카테고리별 분석작업중 일부

no,main_category,Trend
1,beauty,-0.00015
2,athletic apparel,0.00250
3,tops & blouses,0.00012
4,electronics,-0.00130
5,shoes,-0.00025
6,kids,0.00105
7,home,-0.00005
8,toys,-0.00008
9,jewelry,0.00024
10,vintage & collectibles,0.00117
11,women's handbags,0.00157
12,dresses,-0.00037
13,women's accessories,0.00148
14,jeans,0.00133
15,sweaters,0.00028
16,underwear,0.00060
17,handmade,0.00083
18,sports & outdoors,-0.00045
19,swimwear,-0.00054
20,men's accessories,0.00126
21,coats & jackets,0.00130
22,other,0.00268
23,daily & travel items,-0.00051
24,skirts,-0.00035
25,office supplies,-0.00235
26,pants,0.00112
27,sweats & hoodies,0.00211
28,books,0.00075
29,feeding,0.00102
30,pet supplies,0.00134
31,diapering,-0.00270
32,maternity,-0.00103
33,gear,0.00136
34,automotive,-0.00054
35,suits,0.00412
36,nursery,-0.00031
37,shorts,0.00137

# Reload semifinal4 and attach a per-category Trend value to every row.
output_semi4_df = pd.read_csv(
    '/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal4.csv'
)
pd.set_option('display.max_columns', None)

# Lookup table: main_category -> Trend value.
trend_dict = {
    'beauty': -0.00015,
    'athletic apparel': 0.00250,
    'tops & blouses': 0.00012,
    'electronics': -0.00130,
    'shoes': -0.00025,
    'kids': 0.00105,
    'home': -0.00005,
    'toys': -0.00008,
    'jewelry': 0.00024,
    'vintage & collectibles': 0.00117,
    "women's handbags": 0.00157,
    'dresses': -0.00037,
    "women's accessories": 0.00148,
    'jeans': 0.00133,
    'sweaters': 0.00028,
    'underwear': 0.00060,
    'handmade': 0.00083,
    'sports & outdoors': -0.00045,
    'swimwear': -0.00054,
    "men's accessories": 0.00126,
    'coats & jackets': 0.00130,
    'other': 0.00268,
    'daily & travel items': -0.00051,
    'skirts': -0.00035,
    'office supplies': -0.00235,
    'pants': 0.00112,
    'sweats & hoodies': 0.00211,
    'books': 0.00075,
    'feeding': 0.00102,
    'pet supplies': 0.00134,
    'diapering': -0.00270,
    'maternity': -0.00103,
    'gear': 0.00136,
    'automotive': -0.00054,
    'suits': 0.00412,
    'nursery': -0.00031,
    'shorts': 0.00137,
}

# assign() returns a new frame, so output_semi4_df itself is left untouched.
# Categories that are not keys of trend_dict end up with NaN in Trend.
output_semifinal5_df = output_semi4_df.assign(
    Trend=output_semi4_df['main_category'].map(trend_dict)
)

# Check the result (first five rows).
print(output_semifinal5_df.head())

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the semifinal5 dataset once. The original read this same file twice in
# a row; a single load gives the same frame.
output_semi5_df = pd.read_csv('/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal5.csv')
pd.set_option('display.max_columns', None)

# Bare expression: renders the frame when this is the last line of a notebook cell.
output_semi5_df

# Numeric columns that go into the correlation analysis.
numeric_cols = [
    'item_condition_id', 'price', 'shipping', 'words_counts',
    'positive_count', 'negative_count',
    'avg_price_from_new_categoty', 'price_sell_new_categoty', 'Trend'
]

# Pairwise correlation matrix restricted to those columns.
corr_matrix = output_semi5_df[numeric_cols].corr()

# Annotated heatmap of the matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".4f", cmap='coolwarm')
plt.title("Selected Numeric Columns Correlation Heatmap")
plt.tight_layout()
plt.show()

눈에 띄는 상관관계 쌍
words_counts ↔ positive_count (약 0.4608) 상품 설명이 길어질수록(단어 수 증가) 긍정적인 단어가 포함될 가능성이 높아지는 경향을 보여줍니다.
이는 설명을 길게 작성하면 자연스럽게 “great”, “excellent” 같은 긍정 단어를 더 많이 쓰는 경우가 있을 수 있음을 의미합니다.
price ↔ price_sell_new_categoty (약 0.4353) 원래 가격이 높은 상품일수록 해당 새 카테고리 기준 가격(price_sell_new_categoty)도 높게 형성되는 경향을 보입니다.
avg_price_from_new_categoty와도 0.4125 정도로 중간 정도의 양의 상관을 보이는데, 이는 카테고리별 평균 가격이 높은 상품군은 실제 판매 가격(price)도 높은 편일 가능성이 있음을 시사합니다.
item_condition_id ↔ positive_count (약 0.1921) item_condition_id 값이 높을수록 상품 설명에 긍정적인 표현이 조금 더 들어가는 경향이 있습니다. 아래 박스플롯 결론에서 낮은 item_condition_id를 "상태가 좋은 편"으로 해석한 것과 맞추면, 이는 상태가 덜 좋은 상품일수록 긍정 표현이 약간 더 많다는 뜻이 됩니다.
절댓값이 큰 상관은 아니지만, 제품 상태와 긍정적 서술(“great”, “perfect” 등)이 약하게 연동될 수 있음을 암시합니다.
Trend와 가장 높은 상관: negative_count (0.0779), item_condition_id (0.0756) 정도로, 여전히 상관이 매우 낮습니다. 이는 부정적 단어가 조금 많은 상품이 Trend 수치와 아주 약하게나마 양의 관계를 가질 수 있음을 시사하나, 그 영향은 미미합니다.

# Scatter-matrix of the selected numeric columns, with a title above the grid.
pairplot_frame = output_semifinal5_df[numeric_cols]
sns.pairplot(pairplot_frame)

# y > 1 places the title above the top row of subplots.
plt.suptitle("Pairplot of Selected Numeric Variables", y=1.02)
plt.show()

1. 전반적인 변수 분포 특징
가격 관련 변수들끼리는 중간 정도의 양의 상관이 존재하며, 설명 단어 수와 긍정 단어 수도 함께 증가하는 경향이 확인.
price, words_counts, positive_count, negative_count 등은 오른쪽 꼬리가 긴(Right-skewed) 분포를 보이는 경향
item_condition_id, shipping 등은 이산형(Discrete) 변수로, 히스토그램에서 구분된 막대나 소수의 고유값 클러스터를 확인가능
Trend가 5년 전 대비 달라진 소비 성향을 반영한다면, 카테고리(main_category, sub1_category 등), 브랜드와의 관계가 더 클 수 있다.
범주형 변수를 활용해 그룹별 평균 Trend를 비교하거나, 필요 시 One-Hot Encoding 후 추가 상관분석/모델링을 고려할 수 있다.
회귀분석(다항식 항 추가), 트리 기반 모델(Random Forest, XGBoost 등)을 통해 더 복합적인 상호작용을 탐색해 볼 수 있다.

# Machine-learning libraries (a separate side experiment).
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

# Regression models.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# XGBoost (needs installing first: pip install xgboost).
import xgboost as xgb

import pandas as pd

# Read the CSV (variable name aligned to output_semi5_df).
output_semi5_df = pd.read_csv('/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal5.csv')

# Example setup for predicting Trend.
features = [
    'item_condition_id', 'price', 'shipping', 'words_counts',
    'positive_count', 'negative_count',
    'avg_price_from_new_categoty', 'price_sell_new_categoty'
]
target = 'Trend'

# Drop rows with a missing feature or target before modelling, as the later
# modelling cells do; scikit-learn's default input validation rejects NaN.
model_rows = output_semi5_df.dropna(subset=features + [target])
X = model_rows[features]
y = model_rows[target]

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Degree-2 polynomial expansion, fitted on the training split only.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Degree-2 polynomial regression of Trend on the numeric listing features.
output_semifinal5_df = pd.read_csv('/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal5.csv')

# Predictors and target.
features = [
    'item_condition_id', 'price', 'shipping', 'words_counts',
    'positive_count', 'negative_count',
    'avg_price_from_new_categoty', 'price_sell_new_categoty'
]
target = 'Trend'

# Steps 1-2: keep only the modelling columns, then drop any row with a missing value.
df_model = output_semifinal5_df[features + [target]].dropna()

# Step 3: split into predictors and response.
X, y = df_model[features], df_model[target]

# Step 4: 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Expand the predictors to degree 2; the transformer is fitted on the training split.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Ordinary least squares on the expanded features.
lin_reg = LinearRegression()
lin_reg.fit(X_train_poly, y_train)

# Predict on the held-out split and score the predictions.
y_pred_lin = lin_reg.predict(X_test_poly)
mse_lin = mean_squared_error(y_test, y_pred_lin)
r2_lin = r2_score(y_test, y_pred_lin)

print("=== Polynomial Regression (degree=2) ===")
print("MSE:", mse_lin)
print("R^2:", r2_lin)

=== Polynomial Regression (degree=2) ===
MSE: 1.0686324650030107e-06
R^2: 0.059881012127914746

설명력 부족:
R² 값이 낮다는 것은 모델이 Trend의 변동을 충분히 설명하지 못한다는 것을 의미
현재 사용한 피처들이 Trend에 영향을 주는 요인을 포착하지 못했거나, Trend와의 관계가 더 복잡(비선형, 상호작용 등)할 가능성이 있다.

# Same degree-2 polynomial regression, this time filtering only on a missing Trend.
df_trend = output_semifinal5_df.dropna(subset=['Trend'])

# Predictors and target.
features = [
    'item_condition_id', 'price', 'shipping', 'words_counts',
    'positive_count', 'negative_count',
    'avg_price_from_new_categoty', 'price_sell_new_categoty'
]
target = 'Trend'

X, y = df_trend[features], df_trend[target]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second-degree polynomial expansion, fitted on the training split.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit the linear model on the expanded features.
lin_reg = LinearRegression()
lin_reg.fit(X_train_poly, y_train)

# Predict on the held-out split and score the predictions.
y_pred_lin = lin_reg.predict(X_test_poly)
mse_lin = mean_squared_error(y_test, y_pred_lin)
r2_lin = r2_score(y_test, y_pred_lin)

print("=== Polynomial Regression (degree=2) ===")
print("MSE:", mse_lin)
print("R^2:", r2_lin)

=== Polynomial Regression (degree=2) ===
MSE: 1.0686324650030107e-06
R^2: 0.059881012127914746

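(아래 Random Forest 결과에 대한 해석) R²: 약 0.99965 → Random Forest 모델은 거의 완벽한 예측력을 보여주고 있습니다. 바로 위의 다항 회귀 결과(R² 약 0.06)와는 크게 대비됩니다.
선택한 피처들이 Trend 값을 거의 완벽하게 설명하고 있음을 시사하며,
모델이 데이터의 복잡한 비선형 관계와 상호작용을 효과적으로 포착한 것으로 보임.

Random Forest의 결과는 거의 완벽에 가까워 보이지만, 너무 높은 R² 값이 과적합(overfitting)을 시사할 가능성도 있음. 특히 Trend는 main_category마다 고정된 값을 매핑해 만든 컬럼이므로, avg_price_from_new_categoty처럼 카테고리 단위로 계산된 것으로 보이는 피처가 카테고리를 사실상 식별해 주는 데이터 누수(leakage)는 아닌지 먼저 확인할 필요가 있음.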

import xgboost as xgb

# Tree-based regressors for Trend, trained on the same numeric features.
# NOTE(review): Trend is assigned per main_category by a lookup table earlier in
# this file; if the *_new_categoty features are category-level aggregates, the
# trees may simply be recovering the category. Confirm before trusting the scores.
df_trend = output_semifinal5_df.dropna(subset=['Trend'])

# Predictors and target.
features = [
    'item_condition_id', 'price', 'shipping', 'words_counts',
    'positive_count', 'negative_count',
    'avg_price_from_new_categoty', 'price_sell_new_categoty'
]
target = 'Trend'

X, y = df_trend[features], df_trend[target]

# 80% train / 20% test with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------------------------------------------
# Random Forest regression
# --------------------------------------------
rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("=== Random Forest Regression ===")
print("MSE:", mse_rf)
print("R^2:", r2_rf)

# --------------------------------------------
# XGBoost regression
# --------------------------------------------
xgbr = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
xgbr.fit(X_train, y_train)
y_pred_xgb = xgbr.predict(X_test)

mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("=== XGBoost Regression ===")
print("MSE:", mse_xgb)
print("R^2:", r2_xgb)

=== Random Forest Regression ===
MSE: 3.958341127925316e-10
R^2: 0.9996517688001527
=== XGBoost Regression ===
MSE: 2.7732595183880094e-07
R^2: 0.7560251988483041

# Relationship between Trend and price_sell_new_categoty.

# Step 1: keep only rows that have a Trend value.
df_trend = output_semi5_df.dropna(subset=['Trend'])

# Step 2: correlation coefficient between the two columns.
corr_value = df_trend['Trend'].corr(df_trend['price_sell_new_categoty'])
print("Correlation between Trend and price_sell_new_categoty:", corr_value)

# Step 3: scatter plot with a fitted regression line drawn on top.
axis_spec = dict(x='price_sell_new_categoty', y='Trend', data=df_trend)

plt.figure(figsize=(8, 6))
sns.scatterplot(**axis_spec, alpha=0.3, color='blue', label='Data Points')
# scatter=False: draw only the fitted line, since the points are already plotted.
sns.regplot(**axis_spec, scatter=False, color='red', label='Regression Line')

plt.title("Trend vs. price_sell_new_categoty")
plt.xlabel("price_sell_new_categoty")
plt.ylabel("Trend")
plt.legend()
plt.tight_layout()
plt.show()

price_sell_new_categoty 값을 1과 0으로 분류했을 때는 Trend와 뚜렷한 상관관계가 보이지 않는다.
따라서 price_sell_new_categoty 값을 더 세분화하여 비교해 볼 필요가 있다.

import pandas as pd

# Step 1: load the dataset and build the feature / target frames for price prediction.
output_semi5_df = pd.read_csv(
    '/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal5.csv'
)
pd.set_option('display.max_columns', None)

# Prediction target: the actual selling price.
target = 'price'

# Use only rows where price is present.
df = output_semi5_df.dropna(subset=['price']).copy()

# Feature columns, in order. The type and meaning notes are the author's.
columns_to_use = [
    'item_condition_id',            # item condition (numeric)
    'brand_name',                   # brand (categorical)
    'category_name',                # category info (categorical)
    'main_category',                # main category (categorical)
    'sub1_category',                # sub-category 1 (categorical)
    'sub2_category',                # sub-category 2 (categorical)
    'sub3_category',                # sub-category 3 (categorical)
    'sub4_category',                # sub-category 4 (categorical)
    'new_category',                 # new category (categorical)
    'shipping',                     # shipping flag (numeric; 0, 1)
    'avg_price_from_new_categoty',  # average price per category (numeric)
    'price_sell_new_categoty',      # selling price per category (numeric)
    'words_counts',                 # description length (numeric)
    'positive_count',               # number of positive words (numeric)
    'negative_count',               # number of negative words (numeric)
    'Trend',                        # degree of price change (numeric)
]

# Independent copies of the features and the target.
df_features = df.loc[:, columns_to_use].copy()
df_target = df.loc[:, target].copy()

print("데이터 준비 완료")
print("특징 데이터셋 모양:", df_features.shape)
print("타깃 데이터셋 모양:", df_target.shape)

데이터 준비 완료
특징 데이터셋 모양: (1481661, 16)
타깃 데이터셋 모양: (1481661,)

# Correlation analysis restricted to the numeric features.
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric columns only (the categorical ones are not encoded yet).
numeric_cols = [
    'item_condition_id',
    'shipping',
    'avg_price_from_new_categoty',
    'price_sell_new_categoty',
    'words_counts',
    'positive_count',
    'negative_count',
    'Trend',
]

# Pairwise correlation matrix of those columns.
numeric_features = df_features.loc[:, numeric_cols]
corr_matrix = numeric_features.corr()

# Annotated heatmap of the matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".4f", cmap='coolwarm')
plt.title("Numeric Feature Correlation Matrix", fontsize=16)
plt.tight_layout()
plt.show()

import pandas as pd

# Build a two-column frame (item_condition_id, brand_name) for the brand box plot.
output_semi5_df = pd.read_csv(
    '/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal5.csv'
)

# Columns to keep: the item condition and the brand.
columns_to_use = ['item_condition_id', 'brand_name']

# Use only rows where price is present.
df = output_semi5_df.dropna(subset=['price']).copy()

# Independent copy restricted to the selected columns.
df_features = df.loc[:, columns_to_use].copy()

print("데이터 준비 완료. df_features 모양:", df_features.shape)
print(df_features.head())

import subprocess

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

# 1. Install the Nanum fonts. subprocess.run replaces the notebook-only
#    "!apt-get" shell escape so the file is also valid plain Python.
#    check=False keeps it best-effort: a failed install does not raise.
subprocess.run(["apt-get", "-qq", "install", "fonts-nanum"], check=False)

# 2. Refresh the font cache.
subprocess.run(["fc-cache", "-fv"], check=False)

# 3. Register every NanumGothic font file found on the system with matplotlib.
font_paths = fm.findSystemFonts(fontpaths=None, fontext='ttf')
for fpath in font_paths:
    if 'NanumGothic' in fpath:
        fm.fontManager.addfont(fpath)

# 4. Make NanumGothic the default family so the Korean title below renders.
plt.rcParams['font.family'] = 'NanumGothic'

# Box plot: item_condition_id distribution for the ten most frequent brands.
top_brands = df_features['brand_name'].value_counts().head(10).index
df_top = df_features[df_features['brand_name'].isin(top_brands)]

plt.figure(figsize=(12, 6))
sns.boxplot(x='brand_name', y='item_condition_id', data=df_top, palette='viridis')
plt.title("상위 10개 브랜드별 Item Condition 분포", fontsize=16)
plt.xlabel("Brand", fontsize=14)
plt.ylabel("Item Condition ID", fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

결론
전반적으로 제품 상태(item_condition_id)가 2~3 사이에 몰려 있어, 가장 흔한 상태는 중간 정도의 사용감이라고 볼 수 있다.
브랜드별로 큰 차이가 보이지 않으나, 일부 브랜드(예: ‘pink’, ‘forever 21’)는 상대적으로 상태가 좋은 편(낮은 item_condition_id)이며, ‘unknown’ 브랜드는 상태 범위가 넓어 편차가 크다.
특정 브랜드 제품은 평균적으로 더 좋은 상태로 판매되는지(또는 상태가 고르지 않은지) 등을 파악할 수 있다.

Trend(트렌드) 컬럼과 다른 변수 간의 연관성 찾기 작업

# Trend distribution across the ten most frequent brands.
output_semi5_df = pd.read_csv(
    '/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_semifinal5.csv'
)

# Columns to keep: Trend and the brand.
columns_to_use = ['Trend', 'brand_name']

# Use only rows where price is present.
df = output_semi5_df.dropna(subset=['price']).copy()

# Independent copy restricted to the selected columns.
df_features = df.loc[:, columns_to_use].copy()

print("데이터 준비 완료. df_features 모양:", df_features.shape)
print(df_features.head())

# The ten brands with the most rows.
brand_frequency = df_features['brand_name'].value_counts()
top_brands = brand_frequency.head(10).index
df_top = df_features[df_features['brand_name'].isin(top_brands)]

# Box plot of Trend for each of those brands.
plt.figure(figsize=(12, 6))
sns.boxplot(x='brand_name', y='Trend', data=df_top, palette='viridis')
plt.title("상위 10개 브랜드별 Trend 분포", fontsize=16)
plt.xlabel("Brand", fontsize=14)
plt.ylabel("Trend", fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plots of Trend_100 against each comparison column, with the
# correlation coefficient shown in the title.
#
# The original re-read the same CSV and rebuilt the same filtered frame before
# every plot. The file is now loaded once and the plots come from one loop;
# titles, axis labels and plot order are unchanged.
output_final_df = pd.read_csv('/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_final.csv')
pd.set_option('display.max_columns', None)

# Bare expression: renders the frame when this is the last line of a notebook cell.
output_final_df

# Keep only rows that have a Trend_100 value.
filtered_df = output_final_df[output_final_df['Trend_100'].notna()]

# (column name, label used in the title and on the y-axis), in plot order.
trend_scatter_targets = [
    ('selected_shipping_percentage', 'Selected Shipping Percentage'),
    ('sellability', 'Sellability'),
    ('price_vs_avg_price', 'Price_vs_Avg_Price'),
    ('condition_label_num', 'Condition_Label_Num'),
    ('words_counts_sum', 'Words_Counts_Sum'),
    ('shipping_0_percentage', 'Shipping_0_Percentage'),
    ('avg_price_from_new_categoty', 'Avg_Price_From_New_Categoty'),
    ('negative_count', 'Negative_Count'),
    ('words_counts', 'Words_Counts'),
    ('price', 'Price'),
    ('positive_count', 'Positive_Count'),
]

for column, label in trend_scatter_targets:
    # Correlation between Trend_100 and the current column.
    correlation = filtered_df['Trend_100'].corr(filtered_df[column])

    # One figure per column.
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=filtered_df, x='Trend_100', y=column)

    # Put the correlation coefficient on the second line of the title.
    plt.title(f"Scatter Plot of Trend_100 vs {label}\nCorrelation: {correlation:.2f}")
    plt.xlabel('Trend_100')
    plt.ylabel(label)
    plt.show()

# Rows with the ten largest and the ten smallest Trend_100 values.
output_final_df = pd.read_csv(
    '/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_final.csv'
)

# Ten rows with the highest Trend_100.
top_10_trend = output_final_df.nlargest(10, 'Trend_100')

# Ten rows with the lowest Trend_100.
bottom_10_trend = output_final_df.nsmallest(10, 'Trend_100')

# Keep only the category and its Trend_100 for reporting.
report_columns = ['main_category', 'Trend_100']
top_10_main_category_trend = top_10_trend[report_columns]
bottom_10_main_category_trend = bottom_10_trend[report_columns]

# Print both tables.
print("Top 10 Trend_100 categories with values:")
print(top_10_main_category_trend)
print("\nBottom 10 Trend_100 categories with values:")
print(bottom_10_main_category_trend)

# Rows whose Trend_100 is one of the ten largest distinct values.
output_final_df = pd.read_csv('/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_final.csv')

# Distinct non-missing Trend_100 values, largest first.
unique_trend_values = output_final_df['Trend_100'].dropna().unique()
sorted_trend_values = sorted(unique_trend_values, reverse=True)

# The ten largest distinct values.
top_10_trend_values = sorted_trend_values[:10]

# Matching rows, reduced to category + value. The printed label says the table
# is sorted, so sort by Trend_100 descending (it was left in file order).
# kind='stable' keeps file order among rows with equal values.
top_10_main_category_trend = (
    output_final_df[output_final_df['Trend_100'].isin(top_10_trend_values)]
    [['main_category', 'Trend_100']]
    .sort_values('Trend_100', ascending=False, kind='stable')
)

# Print the table.
print("Top 10 different Trend_100 categories with values (sorted Trend_100):")
print(top_10_main_category_trend)

# Rows for the ten largest and the ten smallest distinct Trend_100 values.
output_final_df = pd.read_csv('/content/drive/MyDrive/아이펠_오마카세_프로젝트/ 최종 데이터셋/output_final.csv')

# Distinct non-missing Trend_100 values in both orders.
unique_trend_values = output_final_df['Trend_100'].dropna().unique()
sorted_trend_values_desc = sorted(unique_trend_values, reverse=True)  # largest first
sorted_trend_values_asc = sorted(unique_trend_values)  # smallest first

# Ten largest distinct values.
top_10_trend_values_desc = sorted_trend_values_desc[:10]

# Ten smallest distinct values.
top_10_trend_values_asc = sorted_trend_values_asc[:10]

# Matching rows, reduced to category + value. Each table is sorted to match its
# printed label (both were left in file order). kind='stable' keeps file order
# among rows with equal values.
report_columns = ['main_category', 'Trend_100']
top_10_main_category_trend_desc = (
    output_final_df[output_final_df['Trend_100'].isin(top_10_trend_values_desc)]
    [report_columns]
    .sort_values('Trend_100', ascending=False, kind='stable')
)
top_10_main_category_trend_asc = (
    output_final_df[output_final_df['Trend_100'].isin(top_10_trend_values_asc)]
    [report_columns]
    .sort_values('Trend_100', ascending=True, kind='stable')
)

# Print both tables.
print("Top 10 Trend_100 categories with values (sorted Trend_100 descending):")
print(top_10_main_category_trend_desc)
print("\nTop 10 Trend_100 categories with values (sorted Trend_100 ascending):")
print(top_10_main_category_trend_asc)

데이터로 가치를 만드는 Steven, Follow on LinkedIn

'데이터 분석가:Applied Data Analytics' 카테고리의 다른 글

Zotero란? (1)	2025.03.04
OneTab 설치 (0)	2025.03.03
Obsidian (0)	2025.03.02
커리어 관리툴 분석 (0)	2025.03.01
플랫폼 추천 (경력 정리, 데이터 분석 기록용) (0)	2025.03.01

현재글Main Quest04 실습-3일차2nd

데이터분석가 기술블로그

일정분석 전문가의 데이터분석가 기술블로그입니다.

태블로, 운동, 데이터 시각화, mq06, SQL, 빅분기, 베트남, Ai, 건강, mainquest06, 시각화, 데이터분석가, Tableau, 일정관리, 데이터시각화, ADsP, 파이썬, 데이터분석, 공정관리, 머신러닝,

Today :
Yesterday :

데이터분석가 기술블로그