Hotel Booking Demand 完整分析流程(從讀取資料開始)

Hotel Booking Demand 完整分析流程(從讀取資料開始)
Photo by Tim Oun / Unsplash
本流程預設讀取工作目錄下的資料檔:hotel_bookings.csv

如果檔名不同,把程式裡的檔名改掉就可以。


1. 載入套件

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

2. 讀取 dataset

# Load the booking records from the CSV file (path is relative to the
# current working directory).
df = pd.read_csv("hotel_bookings.csv")

# Preview the first 5 rows.
first_rows = df.head(5)
print(first_rows)

3. 查看資料基本資訊

# Dataset dimensions as (rows, columns).
print("資料維度:", df.shape)

# Column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

# Summary statistics of the numeric columns.
print(df.describe())

# Summary statistics (count / unique / top / freq) of the object columns.
print(df.describe(include="object"))

4. 檢查缺失值

# Number of missing values in each column.
missing_count = df.isna().sum()

# Share of missing values in each column, expressed in percent.
missing_ratio = missing_count / len(df) * 100

# Combine both measures into one table, worst columns first.
missing_table = pd.concat(
    [missing_count.rename("缺失數量"), missing_ratio.rename("缺失比例(%)")],
    axis=1,
).sort_values(by="缺失數量", ascending=False)

# Show only the columns that actually contain missing values.
has_missing = missing_table["缺失數量"] > 0
print(missing_table.loc[has_missing])

5. 資料前處理

5.1 日期欄位轉換

# Parse the status date column into datetimes; errors="coerce" turns values
# that cannot be parsed into NaT instead of raising.
df["reservation_status_date"] = pd.to_datetime(
    df["reservation_status_date"],
    errors="coerce",
)

5.2 處理缺失值

# Replacement value for each column that has missing entries:
# children -> 0, country -> "Unknown", agent -> 0.
fill_values = {
    "children": 0,
    "country": "Unknown",
    "agent": 0,
}

for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)

5.3 刪除缺失過多欄位

# The "company" column has too many missing values to be useful, so it is
# removed from the frame in place.
df.drop("company", axis=1, inplace=True)

5.4 建立總入住人數欄位

# Party size of each booking: adults + children + babies.
df["total_guests"] = df["adults"].add(df["children"]).add(df["babies"])

# Report how many bookings have nobody staying at all.
zero_guest_rows = df["total_guests"] == 0
print("總入住人數為 0 的筆數:", zero_guest_rows.sum())

5.5 刪除異常資料

# Keep only bookings with at least one guest and a non-negative ADR.
# Rows whose "adr" is NaN fail the >= comparison and are dropped as well,
# exactly as with two consecutive filters.
valid_rows = (df["total_guests"] > 0) & (df["adr"] >= 0)

# .copy() makes the result an independent DataFrame. Without it, the columns
# assigned to df afterwards are writes to a filtered slice, which pandas flags
# with SettingWithCopyWarning (invisible here because warnings are globally
# ignored at the top of the script).
df = df.loc[valid_rows].copy()

print("清理後資料維度:", df.shape)

6. 特徵工程

6.1 總住宿夜數

# Total length of stay: weekend nights plus week nights.
df["total_nights"] = df["stays_in_weekend_nights"].add(df["stays_in_week_nights"])

6.2 是否更換房型

# 1 when the assigned room type differs from the reserved one, otherwise 0.
df["room_changed"] = (
    df["reserved_room_type"].ne(df["assigned_room_type"]).astype(int)
)

6.3 月份轉數字

# English month names in calendar order; the position (starting at 1) is the
# month number.
month_names = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
month_map = {name: number for number, name in enumerate(month_names, start=1)}

# Month names that are not keys of month_map become NaN.
df["arrival_month_num"] = df["arrival_date_month"].map(month_map)

6.4 平均每位旅客房價

# Average daily rate divided by the number of guests in the booking.
df["adr_per_guest"] = df["adr"].div(df["total_guests"])

6.5 是否為家庭旅客

# 1 when the booking includes at least one child or baby, otherwise 0.
df["is_family"] = df[["children", "babies"]].gt(0).any(axis=1).astype(int)

7. 探索性資料分析(EDA)

7.1 整體取消率

# Mean of the 0/1 cancellation flag = overall share of cancelled bookings.
cancel_rate = df["is_canceled"].mean()
cancel_rate_line = "整體取消率:{:.2%}".format(cancel_rate)
print(cancel_rate_line)

7.2 不同飯店類型的取消率

# Cancellation rate for each value of the "hotel" column.
cancel_by_hotel = df["is_canceled"].groupby(df["hotel"]).mean()
print(cancel_by_hotel)

7.3 不同訂金類型的取消率

# Cancellation rate for each value of the "deposit_type" column.
cancel_by_deposit = df["is_canceled"].groupby(df["deposit_type"]).mean()
print(cancel_by_deposit)

7.4 不同客戶類型的取消率

# Cancellation rate for each value of the "customer_type" column.
cancel_by_customer = df["is_canceled"].groupby(df["customer_type"]).mean()
print(cancel_by_customer)

7.5 不同市場區隔的取消率

# Cancellation rate for each "market_segment", highest rate first.
segment_rates = df["is_canceled"].groupby(df["market_segment"]).mean()
cancel_by_segment = segment_rates.sort_values(ascending=False)
print(cancel_by_segment)

8. 視覺化分析

8.1 飯店類型分布

# Matplotlib's default font (DejaVu Sans) has no CJK glyphs, so the Chinese
# titles and labels used in this and every later figure would be drawn as
# empty boxes. Put common Traditional Chinese fonts for Windows, macOS and
# Linux in front of the sans-serif fallback list; matplotlib uses the first
# family it can find (and may log a findfont notice for absent ones).
cjk_font_families = [
    "Microsoft JhengHei",
    "PingFang TC",
    "Heiti TC",
    "Noto Sans CJK TC",
    "Noto Sans CJK JP",
]
# Filter out names already present so re-running this cell does not keep
# growing the list.
other_font_families = [
    family
    for family in plt.rcParams["font.sans-serif"]
    if family not in cjk_font_families
]
plt.rcParams["font.sans-serif"] = cjk_font_families + other_font_families
# Many CJK fonts lack the Unicode minus sign (U+2212); use the ASCII hyphen.
plt.rcParams["axes.unicode_minus"] = False

# Bar chart: number of bookings per hotel type.
plt.figure(figsize=(8, 5))
df["hotel"].value_counts().plot(kind="bar")
plt.title("飯店類型分布")
plt.xlabel("飯店類型")
plt.ylabel("筆數")
plt.xticks(rotation=0)
plt.show()

8.2 取消與未取消筆數

# Bar chart: number of bookings per cancellation flag, ordered 0 then 1.
fig, ax = plt.subplots(figsize=(6, 5))
cancel_counts = df["is_canceled"].value_counts().sort_index()
cancel_counts.plot(kind="bar", ax=ax)
ax.set_title("取消與未取消訂單數量")
ax.set_xlabel("是否取消 (0=未取消, 1=取消)")
ax.set_ylabel("筆數")
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

8.3 不同飯店類型取消率

# Bar chart: cancellation rate per hotel type.
fig, ax = plt.subplots(figsize=(8, 5))
cancel_by_hotel.plot(kind="bar", ax=ax)
ax.set_title("不同飯店類型取消率")
ax.set_xlabel("飯店類型")
ax.set_ylabel("取消率")
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

8.4 不同訂金類型取消率

# Bar chart: cancellation rate per deposit type.
fig, ax = plt.subplots(figsize=(8, 5))
cancel_by_deposit.plot(kind="bar", ax=ax)
ax.set_title("不同訂金類型取消率")
ax.set_xlabel("訂金類型")
ax.set_ylabel("取消率")
plt.setp(ax.get_xticklabels(), rotation=0)
plt.show()

8.5 訂房提前天數分布

# Histogram of lead_time using 50 bins.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["lead_time"], bins=50)
ax.set_title("訂房提前天數分布")
ax.set_xlabel("lead_time")
ax.set_ylabel("筆數")
plt.show()

8.6 ADR 分布

# Histogram of adr using 50 bins.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["adr"], bins=50)
ax.set_title("平均每日房價(ADR)分布")
ax.set_xlabel("adr")
ax.set_ylabel("筆數")
plt.show()

8.7 各月份取消率

# Cancellation rate per arrival month number.
monthly_cancel = df["is_canceled"].groupby(df["arrival_month_num"]).mean()

# Line chart with one marker per month and a tick for each of 1..12.
fig, ax = plt.subplots(figsize=(10, 5))
monthly_cancel.plot(marker="o", ax=ax)
ax.set_title("各月份取消率")
ax.set_xlabel("月份")
ax.set_ylabel("取消率")
ax.set_xticks(range(1, 13))
plt.show()

8.8 前 10 名客戶來源國

# Bar chart: the 10 most frequent values of the "country" column.
fig, ax = plt.subplots(figsize=(10, 5))
top_countries = df["country"].value_counts().head(10)
top_countries.plot(kind="bar", ax=ax)
ax.set_title("前 10 名客戶來源國")
ax.set_xlabel("國家")
ax.set_ylabel("筆數")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

8.9 特殊需求數量與取消率關係

# Cancellation rate for each number of special requests.
special_req_cancel = (
    df["is_canceled"].groupby(df["total_of_special_requests"]).mean()
)

# Line chart with one marker per request count.
fig, ax = plt.subplots(figsize=(8, 5))
special_req_cancel.plot(marker="o", ax=ax)
ax.set_title("特殊需求數量與取消率關係")
ax.set_xlabel("特殊需求數")
ax.set_ylabel("取消率")
plt.show()

9. 相關性分析

# Select every numeric dtype instead of only int64/float64. The flags built
# with .astype(int) are int32 where NumPy's default integer is 32-bit
# (e.g. Windows with NumPy < 2) and would otherwise be left out of the
# correlation matrix.
numeric_cols = df.select_dtypes(include="number").columns
corr_matrix = df[numeric_cols].corr()

# Correlation of every numeric column with is_canceled, strongest positive
# first (the is_canceled row itself is the trivial 1.0 entry).
target_corr = corr_matrix["is_canceled"].sort_values(ascending=False)
print(target_corr)

10. 建立預測模型

10.1 定義特徵與目標欄位

注意:
reservation_status 與 reservation_status_date 記錄的是訂單的最終狀態,等於直接透露目標欄位 is_canceled,容易造成資料洩漏,所以要刪除。

# Target: the 0/1 cancellation flag.
y = df["is_canceled"]

# Features: everything except the target and the two reservation-status
# columns, which are excluded to avoid leaking the outcome.
leakage_columns = ["reservation_status", "reservation_status_date"]
X = df.drop(columns=["is_canceled", *leakage_columns])

10.2 區分數值欄位與類別欄位

# Select every numeric dtype rather than only int64/float64. The engineered
# flags created with .astype(int) are int32 where NumPy's default integer is
# 32-bit (e.g. Windows with NumPy < 2); they would then appear in neither
# list and be silently discarded by the ColumnTransformer, whose default is
# to drop columns it was not given.
numeric_features = X.select_dtypes(include="number").columns.tolist()

# Text-valued columns, to be one-hot encoded.
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

print("數值欄位:", numeric_features)
print("類別欄位:", categorical_features)

10.3 建立前處理流程

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

10.4 切分訓練集與測試集

# 80/20 split, stratified on the target, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

10.5 建立模型

# Random forest settings: 200 trees, fixed seed, all CPU cores, and class
# weights balanced by class frequency.
forest_settings = {
    "n_estimators": 200,
    "random_state": 42,
    "n_jobs": -1,
    "class_weight": "balanced",
}
model = RandomForestClassifier(**forest_settings)

# Full pipeline: preprocessing followed by the classifier.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

10.6 訓練模型

# Fit the preprocessing steps and the classifier on the training split
# (fit returns the fitted pipeline itself).
clf = clf.fit(X_train, y_train)

10.7 進行預測

# Hard class predictions for the test split.
y_pred = clf.predict(X_test)

# Predicted probabilities; column 1 is kept as the score for the second class.
class_probabilities = clf.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

11. 模型評估

# Label / value pairs for the headline metrics. ROC-AUC is computed from the
# predicted probabilities, the others from the hard predictions.
metric_rows = (
    ("Accuracy :", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred)),
    ("Recall   :", recall_score(y_test, y_pred)),
    ("F1-score :", f1_score(y_test, y_pred)),
    ("ROC-AUC  :", roc_auc_score(y_test, y_prob)),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

Classification Report

# Per-class precision / recall / F1 summary as formatted text.
report_text = classification_report(y_test, y_pred)
print(report_text)

Confusion Matrix

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

Confusion Matrix 視覺化

# Draw the confusion matrix as an image with a colour bar.
fig, ax = plt.subplots(figsize=(6, 5))
matrix_image = ax.imshow(cm, interpolation="nearest")
ax.set_title("Confusion Matrix")
fig.colorbar(matrix_image, ax=ax)

# Both axes use the same two class labels.
class_labels = ["未取消", "已取消"]
ax.set_xticks([0, 1])
ax.set_xticklabels(class_labels)
ax.set_yticks([0, 1])
ax.set_yticklabels(class_labels)
ax.set_xlabel("預測值")
ax.set_ylabel("實際值")

# Write each count in the centre of its cell (x = column, y = row).
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, count, ha="center", va="center")

fig.tight_layout()
plt.show()

12. 特徵重要性分析

# 取得 One-Hot Encoding 後的類別欄位名稱
ohe = clf.named_steps["preprocessor"].named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(categorical_features)

# 合併全部欄位名稱
all_feature_names = np.concatenate([numeric_features, cat_feature_names])

# 取得特徵重要性
importances = clf.named_steps["model"].feature_importances_

feature_importance_df = pd.DataFrame({
    "feature": all_feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

print(feature_importance_df.head(20))

前 20 個重要特徵視覺化

# Take the 20 most important features, then sort ascending so the largest
# bar ends up at the top of the horizontal bar chart.
top20 = feature_importance_df.head(20).sort_values(by="importance")

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top20["feature"], top20["importance"])
ax.set_title("前 20 個重要特徵")
ax.set_xlabel("importance")
ax.set_ylabel("feature")
plt.show()

13. 分析結論範例

你可以把這段直接寫進報告:

# Fixed report text: a heading followed by five numbered conclusions,
# printed one per line.
conclusion_lines = (
    "分析結論:",
    "1. 此資料集可用來分析旅客訂房行為與取消模式。",
    "2. 整體取消率可反映飯店訂單穩定程度。",
    "3. 不同飯店類型、訂金類型、客戶類型與市場來源的取消率存在差異。",
    "4. lead_time、deposit_type、market_segment、adr、special_requests 等變數通常對取消預測較重要。",
    "5. 使用機器學習模型可有效預測高風險取消訂單,協助飯店優化訂房策略。",
)
for conclusion_line in conclusion_lines:
    print(conclusion_line)