Hotel Booking Demand 完整分析流程(從讀取資料開始)
預設資料檔名為 hotel_bookings.csv(請放在執行程式時的工作目錄下)。
如果檔名或路徑不同,把程式中 pd.read_csv(...) 裡的檔名改掉即可。
1. 載入套件
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
roc_auc_score, classification_report, confusion_matrix
)
2. 讀取 dataset
# Load the bookings CSV into a DataFrame (path is relative to the working directory).
csv_path = "hotel_bookings.csv"
df = pd.read_csv(csv_path)
# Preview the first five rows.
first_rows = df.head(5)
print(first_rows)
3. 查看資料基本資訊
# Dataset dimensions as (rows, columns).
print("資料維度:", df.shape)
# Column names, dtypes and non-null counts. DataFrame.info() writes its report
# directly to stdout and returns None, so it must not be wrapped in print()
# (doing so appends a stray "None" line to the output).
df.info()
# Summary statistics of the numeric columns.
print(df.describe())
# Summary statistics of the object-dtype (text) columns.
print(df.describe(include="object"))
4. 檢查缺失值
# Per-column count of missing values.
missing_count = df.isna().sum()
# The same counts expressed as a percentage of all rows.
missing_ratio = missing_count / len(df) * 100
missing_table = pd.DataFrame(
    {"缺失數量": missing_count, "缺失比例(%)": missing_ratio}
)
missing_table = missing_table.sort_values(by="缺失數量", ascending=False)
# Show only the columns that actually contain missing values.
has_missing = missing_table["缺失數量"] > 0
print(missing_table.loc[has_missing])
5. 資料前處理
5.1 日期欄位轉換
# Parse the status date text into datetimes; values that cannot be parsed become NaT.
raw_status_dates = df["reservation_status_date"]
df["reservation_status_date"] = pd.to_datetime(raw_status_dates, errors="coerce")
5.2 處理缺失值
# Replacement value for missing entries, per column:
# children -> 0, country -> "Unknown", agent -> 0.
fill_values = {"children": 0, "country": "Unknown", "agent": 0}
for column_name, default_value in fill_values.items():
    df[column_name] = df[column_name].fillna(default_value)
5.3 刪除缺失過多欄位
# The company column has too many missing values to be useful; remove it.
df = df.drop(columns=["company"])
5.4 建立總入住人數欄位
# Head count per booking: adults + children + babies.
df["total_guests"] = df["adults"].add(df["children"]).add(df["babies"])
zero_guest_rows = df["total_guests"].eq(0).sum()
print("總入住人數為 0 的筆數:", zero_guest_rows)
5.5 刪除異常資料
# Keep only plausible bookings: at least one guest and a non-negative ADR.
has_guests = df["total_guests"] > 0
valid_adr = df["adr"] >= 0
# Take an explicit copy: new columns are assigned to this filtered frame
# afterwards, and assigning into an un-copied selection triggers pandas'
# SettingWithCopyWarning (chained-assignment ambiguity).
df = df[has_guests & valid_adr].copy()
print("清理後資料維度:", df.shape)
6. 特徵工程
6.1 總住宿夜數
# Length of stay: weekend nights plus week nights.
weekend_nights = df["stays_in_weekend_nights"]
df["total_nights"] = weekend_nights.add(df["stays_in_week_nights"])
6.2 是否更換房型
# 1 when the assigned room type differs from the reserved one, else 0.
room_differs = df["reserved_room_type"].ne(df["assigned_room_type"])
df["room_changed"] = room_differs.astype(int)
6.3 月份轉數字
# English month name -> month number (1-12), built in calendar order.
_month_names = (
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
)
month_map = {name: number for number, name in enumerate(_month_names, start=1)}
# Translate month names to numbers via month_map; names missing from the map become NaN.
arrival_month_names = df["arrival_date_month"]
df["arrival_month_num"] = arrival_month_names.map(month_map)
6.4 平均每位旅客房價
# Average daily rate divided by the number of guests on the booking.
df["adr_per_guest"] = df["adr"].div(df["total_guests"])
6.5 是否為家庭旅客
# 1 when the booking includes at least one child or baby, else 0.
has_minor = df[["children", "babies"]].gt(0).any(axis=1)
df["is_family"] = has_minor.astype(int)
7. 探索性資料分析(EDA)
7.1 整體取消率
# is_canceled is used as a 0/1 flag, so its mean is the share of cancelled bookings.
cancel_rate = df["is_canceled"].mean()
print(f"整體取消率:{cancel_rate:.2%}")
7.2 不同飯店類型的取消率
# Mean of is_canceled (cancellation rate) for each hotel type.
hotel_groups = df.groupby("hotel")
cancel_by_hotel = hotel_groups["is_canceled"].mean()
print(cancel_by_hotel)
7.3 不同訂金類型的取消率
# Mean of is_canceled (cancellation rate) for each deposit type.
deposit_groups = df.groupby("deposit_type")
cancel_by_deposit = deposit_groups["is_canceled"].mean()
print(cancel_by_deposit)
7.4 不同客戶類型的取消率
# Mean of is_canceled (cancellation rate) for each customer type.
customer_groups = df.groupby("customer_type")
cancel_by_customer = customer_groups["is_canceled"].mean()
print(cancel_by_customer)
7.5 不同市場區隔的取消率
# Cancellation rate per market segment, highest rate first.
segment_rates = df.groupby("market_segment")["is_canceled"].mean()
cancel_by_segment = segment_rates.sort_values(ascending=False)
print(cancel_by_segment)
8. 視覺化分析
8.1 飯店類型分布
# Matplotlib's default font has no CJK glyphs, so Chinese titles and labels
# would render as empty boxes. Put common CJK-capable fonts ahead of the
# current defaults; matplotlib uses the first family that is installed and
# otherwise keeps the previous behaviour.
plt.rcParams["font.sans-serif"] = [
    "Microsoft JhengHei",
    "PingFang TC",
    "Noto Sans CJK TC",
    "Noto Sans CJK JP",
    "Arial Unicode MS",
    *plt.rcParams["font.sans-serif"],
]
# Some CJK fonts lack the Unicode minus sign; draw negative numbers with "-".
plt.rcParams["axes.unicode_minus"] = False

# Bar chart: number of bookings per hotel type.
plt.figure(figsize=(8, 5))
df["hotel"].value_counts().plot(kind="bar")
plt.title("飯店類型分布")
plt.xlabel("飯店類型")
plt.ylabel("筆數")
plt.xticks(rotation=0)  # keep category labels horizontal
plt.show()
8.2 取消與未取消筆數
# Bar chart: number of bookings per is_canceled value, ordered by value.
_, ax = plt.subplots(figsize=(6, 5))
cancel_counts = df["is_canceled"].value_counts().sort_index()
cancel_counts.plot.bar(ax=ax, rot=0)
ax.set(
    title="取消與未取消訂單數量",
    xlabel="是否取消 (0=未取消, 1=取消)",
    ylabel="筆數",
)
plt.show()
8.3 不同飯店類型取消率
# Bar chart: cancellation rate per hotel type.
_, ax = plt.subplots(figsize=(8, 5))
cancel_by_hotel.plot.bar(ax=ax, rot=0)
ax.set(title="不同飯店類型取消率", xlabel="飯店類型", ylabel="取消率")
plt.show()
8.4 不同訂金類型取消率
# Bar chart: cancellation rate per deposit type.
_, ax = plt.subplots(figsize=(8, 5))
cancel_by_deposit.plot.bar(ax=ax, rot=0)
ax.set(title="不同訂金類型取消率", xlabel="訂金類型", ylabel="取消率")
plt.show()
8.5 訂房提前天數分布
# Histogram of lead_time using 50 bins.
_, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["lead_time"], bins=50)
ax.set(title="訂房提前天數分布", xlabel="lead_time", ylabel="筆數")
plt.show()
8.6 ADR 分布
# Histogram of adr (average daily rate) using 50 bins.
_, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["adr"], bins=50)
ax.set(title="平均每日房價(ADR)分布", xlabel="adr", ylabel="筆數")
plt.show()
8.7 各月份取消率
# Cancellation rate per arrival month number, drawn as a line with point markers.
month_groups = df.groupby("arrival_month_num")
monthly_cancel = month_groups["is_canceled"].mean()
_, ax = plt.subplots(figsize=(10, 5))
monthly_cancel.plot(ax=ax, marker="o")
ax.set(title="各月份取消率", xlabel="月份", ylabel="取消率")
# One tick for every month, 1 through 12.
ax.set_xticks(range(1, 13))
plt.show()
8.8 前 10 名客戶來源國
# Bar chart: the ten most frequent values of the country column.
_, ax = plt.subplots(figsize=(10, 5))
top_countries = df["country"].value_counts().head(10)
top_countries.plot.bar(ax=ax, rot=45)  # tilt the country codes by 45 degrees
ax.set(title="前 10 名客戶來源國", xlabel="國家", ylabel="筆數")
plt.show()
8.9 特殊需求數量與取消率關係
# Cancellation rate for each count of special requests, drawn as a line with markers.
request_groups = df.groupby("total_of_special_requests")
special_req_cancel = request_groups["is_canceled"].mean()
_, ax = plt.subplots(figsize=(8, 5))
special_req_cancel.plot(ax=ax, marker="o")
ax.set(title="特殊需求數量與取消率關係", xlabel="特殊需求數", ylabel="取消率")
plt.show()
9. 相關性分析
# Select every numeric dtype rather than only int64/float64: columns created
# with .astype(int) are int32 on some platforms and would otherwise be
# silently left out of the correlation matrix.
numeric_cols = df.select_dtypes(include="number").columns
corr_matrix = df[numeric_cols].corr()
# Correlation of each numeric column with is_canceled, most positive first.
target_corr = corr_matrix["is_canceled"].sort_values(ascending=False)
print(target_corr)
10. 建立預測模型
10.1 定義特徵與目標欄位
注意:reservation_status 和 reservation_status_date 會直接反映訂單最後是否取消,留在特徵中會造成資料洩漏(data leakage),所以要刪除。
# Prediction target.
target_column = "is_canceled"
# Columns flagged as data-leakage risks; they are kept out of the feature matrix.
leakage_columns = ["reservation_status", "reservation_status_date"]
X = df.drop(columns=[target_column, *leakage_columns])
y = df[target_column]
10.2 區分數值欄位與類別欄位
# Treat every numeric dtype as a numeric feature, not just int64/float64.
# Columns produced by .astype(int) are int32 on some platforms; a narrower
# whitelist would put them in neither list, and a ColumnTransformer drops
# unlisted columns by default (remainder="drop").
numeric_features = X.select_dtypes(include="number").columns.tolist()
# Object-dtype (text) columns are handled as categorical features.
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
print("數值欄位:", numeric_features)
print("類別欄位:", categorical_features)
10.3 建立前處理流程
# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps categories unseen during fitting from
# raising an error at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its own transformer ("num" first, then "cat").
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
10.4 切分訓練集與測試集
# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed for reproducibility.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
for label, part in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(label, part.shape)
10.5 建立模型
# Random forest of 200 trees using all CPU cores (n_jobs=-1), a fixed seed,
# and class weights balanced against class frequencies.
forest_params = {
    "n_estimators": 200,
    "random_state": 42,
    "n_jobs": -1,
    "class_weight": "balanced",
}
model = RandomForestClassifier(**forest_params)
# Full pipeline: preprocessing followed by the classifier.
clf = Pipeline([("preprocessor", preprocessor), ("model", model)])
10.6 訓練模型
# Fit the preprocessing steps and the classifier on the training split
# (fit returns the pipeline itself).
clf = clf.fit(X_train, y_train)
10.7 進行預測
# Hard class predictions for the test split.
y_pred = clf.predict(X_test)
# Second column of the class-probability matrix; presumably the probability of
# class 1 (cancelled) -- verify against clf.classes_.
class_probabilities = clf.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
11. 模型評估
# Label/value pairs; the threshold metrics use the hard predictions, ROC-AUC
# uses the predicted probabilities.
metric_rows = (
    ("Accuracy :", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred)),
    ("Recall :", recall_score(y_test, y_pred)),
    ("F1-score :", f1_score(y_test, y_pred)),
    ("ROC-AUC :", roc_auc_score(y_test, y_prob)),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)
Classification Report
# Text table of per-class precision, recall, F1 and support.
report_text = classification_report(y_test, y_pred)
print(report_text)
Confusion Matrix
# Counts of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
Confusion Matrix 視覺化
# Draw the confusion matrix as a heat map with a colour bar.
plt.figure(figsize=(6, 5))
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion Matrix")
plt.colorbar()
plt.xticks([0, 1], ["未取消", "已取消"])
plt.yticks([0, 1], ["未取消", "已取消"])
plt.xlabel("預測值")
plt.ylabel("實際值")
# Write each cell's count at its centre. imshow places column j on the x axis
# and row i on the y axis, hence the (j, i) coordinates. The loop bodies must
# be indented: without the nesting the script fails with an IndentationError.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j], ha="center", va="center")
plt.tight_layout()
plt.show()
12. 特徵重要性分析
# 取得 One-Hot Encoding 後的類別欄位名稱
ohe = clf.named_steps["preprocessor"].named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(categorical_features)
# 合併全部欄位名稱
all_feature_names = np.concatenate([numeric_features, cat_feature_names])
# 取得特徵重要性
importances = clf.named_steps["model"].feature_importances_
feature_importance_df = pd.DataFrame({
"feature": all_feature_names,
"importance": importances
}).sort_values(by="importance", ascending=False)
print(feature_importance_df.head(20))
前 20 個重要特徵視覺化
# Take the 20 most important features, then sort ascending so the largest bar
# is drawn at the top of the horizontal chart.
top20 = feature_importance_df.head(20).sort_values(by="importance")
_, ax = plt.subplots(figsize=(10, 8))
ax.barh(top20["feature"], top20["importance"])
ax.set(title="前 20 個重要特徵", xlabel="importance", ylabel="feature")
plt.show()
13. 分析結論範例
你可以把這段直接寫進報告:
# Report-ready conclusion text, printed one line at a time.
conclusion_lines = (
    "分析結論:",
    "1. 此資料集可用來分析旅客訂房行為與取消模式。",
    "2. 整體取消率可反映飯店訂單穩定程度。",
    "3. 不同飯店類型、訂金類型、客戶類型與市場來源的取消率存在差異。",
    "4. lead_time、deposit_type、market_segment、adr、special_requests 等變數通常對取消預測較重要。",
    "5. 使用機器學習模型可有效預測高風險取消訂單,協助飯店優化訂房策略。",
)
for conclusion_line in conclusion_lines:
    print(conclusion_line)