🔢 机器学习基础:从理论到实践的完整指南
- 2025-07-10 广东
本文字数:10556 字
阅读完需:约 35 分钟
🚀 导语:机器学习作为人工智能的核心技术,正在重塑我们的世界。本文将深入探讨机器学习的四大核心领域,从监督学习到特征工程,为你构建完整的机器学习知识体系。
📈 监督学习:分类与回归算法详解
🎯 监督学习概述
监督学习是机器学习中最重要的分支之一,通过已标记的训练数据来学习输入到输出的映射关系。
核心特点:
📊 有标签数据:训练集包含输入特征和对应的目标值
🎯 明确目标:预测新数据的标签或数值
📈 性能可评估:可通过测试集验证模型效果
🏷️ 分类算法深度解析
1. 逻辑回归(Logistic Regression)
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Generate a synthetic binary-classification dataset (1000 samples, 20 features).
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
# Hold out 20% of the samples as a test split; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit a logistic-regression classifier on the training split.
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)
# Predict labels for the held-out split, then print the accuracy and the
# per-class classification report.
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"逻辑回归准确率: {accuracy:.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred))
优势:
✅ 计算效率高
✅ 可解释性强
✅ 对特征缩放的依赖弱于 SVM 等基于距离的算法(但使用正则化时仍建议先做标准化)
✅ 输出概率值
2. 支持向量机(SVM)
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# Standardize the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train an SVM classifier with an RBF kernel on the standardized training data.
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train_scaled, y_train)
# Predict on the standardized test split and report the accuracy.
y_pred_svm = svm_model.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM准确率: {accuracy_svm:.4f}")
3. 随机森林(Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Train a random forest of 100 trees on the training split (the unscaled
# X_train is used here, not X_train_scaled).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Predict on the test split and report the accuracy.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"随机森林准确率: {accuracy_rf:.4f}")
# Plot the fitted model's feature importances as a bar chart, one bar per
# feature index.
feature_importance = rf_model.feature_importances_
plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importance)), feature_importance)
plt.title('随机森林特征重要性')
plt.xlabel('特征索引')
plt.ylabel('重要性')
plt.show()
📊 回归算法实战
1. 线性回归(Linear Regression)
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Generate a synthetic single-feature regression dataset with noise.
X_reg, y_reg = make_regression(n_samples=1000, n_features=1, noise=10, random_state=42)
# Hold out 20% of the samples as a test split.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
# Fit an ordinary linear regression on the training split.
linear_model = LinearRegression()
linear_model.fit(X_train_reg, y_train_reg)
# Predict on the test split and report mean squared error and R^2.
y_pred_reg = linear_model.predict(X_test_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)
print(f"线性回归 MSE: {mse:.4f}")
print(f"线性回归 R²: {r2:.4f}")
# Visualize: scatter the true test targets and overlay the predictions in red.
# X_test_reg is not sorted; the line still renders cleanly here because the
# predictions of a single-feature linear model are collinear.
plt.figure(figsize=(10, 6))
plt.scatter(X_test_reg, y_test_reg, alpha=0.5, label='实际值')
plt.plot(X_test_reg, y_pred_reg, 'r-', label='预测值')
plt.xlabel('特征值')
plt.ylabel('目标值')
plt.title('线性回归预测结果')
plt.legend()
plt.show()
2. 多项式回归
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# Build a polynomial-regression pipeline: expand the input into degree-3
# polynomial features, then fit a linear regression on the expanded features.
poly_model = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('linear', LinearRegression())
])
# Train on the same regression training split used by the linear model above.
poly_model.fit(X_train_reg, y_train_reg)
# Predict on the test split and report mean squared error and R^2.
y_pred_poly = poly_model.predict(X_test_reg)
mse_poly = mean_squared_error(y_test_reg, y_pred_poly)
r2_poly = r2_score(y_test_reg, y_pred_poly)
print(f"多项式回归 MSE: {mse_poly:.4f}")
print(f"多项式回归 R²: {r2_poly:.4f}")
🎲 无监督学习:聚类与降维技术
🔍 无监督学习核心概念
无监督学习从无标签数据中发现隐藏的模式和结构,是数据挖掘和探索性数据分析的重要工具。
🎯 聚类算法详解
1. K-Means 聚类
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
# Generate 300 two-dimensional points around 4 blob centers; y_true holds the
# generating blob of each point and is used only for the reference plot.
X_cluster, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=42)
# Run K-Means with 4 clusters; n_init=10 is passed explicitly.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X_cluster)
# Visualize the result as two side-by-side panels.
plt.figure(figsize=(12, 5))
# Left panel: points colored by their generating blob.
plt.subplot(1, 2, 1)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=y_true, cmap='viridis')
plt.title('真实聚类')
plt.xlabel('特征1')
plt.ylabel('特征2')
# Right panel: points colored by K-Means assignment, with the fitted cluster
# centers drawn as red crosses.
plt.subplot(1, 2, 2)
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], c=y_kmeans, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            c='red', marker='x', s=200, linewidths=3, label='聚类中心')
plt.title('K-Means聚类结果')
plt.xlabel('特征1')
plt.ylabel('特征2')
plt.legend()
plt.tight_layout()
plt.show()
print(f"聚类中心: \n{kmeans.cluster_centers_}")
2. 层次聚类
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
# Agglomerative (hierarchical) clustering into 4 clusters with Ward linkage.
# y_hierarchical is computed here but not used by the plot below.
hierarchical = AgglomerativeClustering(n_clusters=4, linkage='ward')
y_hierarchical = hierarchical.fit_predict(X_cluster)
# Draw the dendrogram. The linkage matrix is computed separately with SciPy
# (also Ward) because the dendrogram helper takes a SciPy linkage matrix.
plt.figure(figsize=(12, 8))
linkage_matrix = linkage(X_cluster, method='ward')
dendrogram(linkage_matrix)
plt.title('层次聚类树状图')
plt.xlabel('样本索引')
plt.ylabel('距离')
plt.show()
3. DBSCAN 密度聚类
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# Standardize the blob data before density clustering.
# NOTE: this rebinds the module-level name `scaler` used by the SVM example.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# DBSCAN on the standardized coordinates; the label -1 marks noise points.
dbscan = DBSCAN(eps=0.3, min_samples=10)
y_dbscan = dbscan.fit_predict(X_scaled)

# Plot every label in its own colour, in the original (unscaled) coordinates.
plt.figure(figsize=(10, 6))
unique_labels = set(y_dbscan)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for cluster_id, point_colour in zip(unique_labels, colors):
    is_noise = cluster_id == -1
    marker = 'x' if is_noise else 'o'
    if is_noise:
        # Noise points are drawn as black crosses.
        point_colour = 'black'
    members = X_cluster[y_dbscan == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], c=[point_colour], marker=marker, s=50)
plt.title('DBSCAN聚类结果')
plt.xlabel('特征1')
plt.ylabel('特征2')
plt.show()

# Summary: number of real clusters (the noise label excluded) and noise count.
print(f"聚类数量: {len(set(y_dbscan)) - (1 if -1 in y_dbscan else 0)}")
print(f"噪声点数量: {list(y_dbscan).count(-1)}")
📉 降维技术实战
1. 主成分分析(PCA)
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import pandas as pd
# Load the iris dataset: feature matrix and integer class labels.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target
# Project the features onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_iris)
# Visualize the result as two side-by-side panels.
plt.figure(figsize=(12, 5))
# Left panel: the raw data, showing only the first two feature columns.
plt.subplot(1, 2, 1)
plt.scatter(X_iris[:, 0], X_iris[:, 1], c=y_iris, cmap='viridis')
plt.xlabel('萼片长度')
plt.ylabel('萼片宽度')
plt.title('原始数据')
# Right panel: the data in the 2-D principal-component space.
plt.subplot(1, 2, 2)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_iris, cmap='viridis')
plt.xlabel('第一主成分')
plt.ylabel('第二主成分')
plt.title('PCA降维结果')
plt.tight_layout()
plt.show()
# Report the variance ratio explained by each component and the running total.
print(f"解释方差比: {pca.explained_variance_ratio_}")
print(f"累计解释方差比: {pca.explained_variance_ratio_.cumsum()}")
2. t-SNE 非线性降维
from sklearn.manifold import TSNE
# Embed the iris features into 2 dimensions with t-SNE (perplexity 30,
# fixed random_state).
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_iris)
# Scatter the embedding, colored by class label; the colorbar maps colors back
# to the integer labels.
plt.figure(figsize=(10, 6))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_iris, cmap='viridis')
plt.xlabel('t-SNE 维度1')
plt.ylabel('t-SNE 维度2')
plt.title('t-SNE降维结果')
plt.colorbar()
plt.show()
🎮 强化学习:从 Q-Learning 到深度强化学习
🎯 强化学习基础概念
强化学习是机器学习的第三大分支,通过与环境交互来学习最优策略。
核心要素:
🤖 智能体(Agent):学习和决策的主体
🌍 环境(Environment):智能体所处的外部世界
🎯 状态(State):环境的当前情况
⚡ 动作(Action):智能体可以执行的操作
🎁 奖励(Reward):环境对动作的反馈
📚 Q-Learning 算法实现
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The Q-table maps a (hashable) state to a NumPy vector holding one value
    per action; rows for states not seen before are created on first access
    and start at zero.

    Args:
        actions: Sequence of hashable action labels. They do not have to be
            the integers ``0..n-1``; each label is mapped to its position in
            the sequence when the Q-table is indexed.
        learning_rate: Step size applied to the temporal-difference error.
        discount_factor: Discount applied to the best next-state value.
        epsilon: Probability of choosing a uniformly random action.
    """

    def __init__(self, actions, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        self.actions = actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        # Position of every action label inside a Q-table row.
        self._action_index = {label: pos for pos, label in enumerate(actions)}
        n_actions = len(actions)
        self.q_table = defaultdict(lambda: np.zeros(n_actions))

    def choose_action(self, state):
        """Return an element of ``self.actions`` chosen epsilon-greedily."""
        if np.random.random() < self.epsilon:
            # Draw an index instead of calling np.random.choice on the labels:
            # the caller gets the original label object back, and labels that
            # are not scalars (e.g. tuples) work too.
            return self.actions[np.random.randint(len(self.actions))]
        return self.actions[int(np.argmax(self.q_table[state]))]

    def learn(self, state, action, reward, next_state):
        """Apply one Q-learning update for ``(state, action, reward, next_state)``.

        Raises:
            KeyError: If ``action`` is not one of ``self.actions``.
        """
        # Translate the label into its column; using the label itself as an
        # index is only correct when the labels happen to be 0..n-1.
        column = self._action_index[action]
        row = self.q_table[state]
        best_next = np.max(self.q_table[next_state])
        td_error = reward + self.discount_factor * best_next - row[column]
        row[column] += self.learning_rate * td_error
# A simple grid-world environment
class GridWorld:
    """Square grid in which an agent walks from (0, 0) to the opposite corner.

    The state is a tuple ``(x, y)``. Action 0 ("up") decrements x, 1 ("down")
    increments x, 2 ("left") decrements y and 3 ("right") increments y. A move
    that would leave the grid, or an unknown action, keeps the position.
    """

    def __init__(self, size=5):
        self.size = size
        self.state = (0, 0)
        self.goal = (size - 1, size - 1)
        self.actions = [0, 1, 2, 3]  # up, down, left, right

    def reset(self):
        """Put the agent back at (0, 0) and return that state."""
        self.state = (0, 0)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        The reward is 100 with ``done=True`` when the resulting state is the
        goal, otherwise -1 with ``done=False``.
        """
        row, col = self.state
        last = self.size - 1
        if action == 0:  # up
            row = row - 1 if row > 0 else row
        elif action == 1:  # down
            row = row + 1 if row < last else row
        elif action == 2:  # left
            col = col - 1 if col > 0 else col
        elif action == 3:  # right
            col = col + 1 if col < last else col
        self.state = (row, col)
        done = self.state == self.goal
        reward = 100 if done else -1
        return self.state, reward, done
# Train the Q-learning agent on the grid world
def _run_episode(environment, learner, max_steps=100):
    """Play one episode, updating the learner after every step; return the summed reward."""
    current = environment.reset()
    episode_return = 0
    for _ in range(max_steps):  # hard cap on the episode length
        chosen = learner.choose_action(current)
        following, gain, finished = environment.step(chosen)
        learner.learn(current, chosen, gain, following)
        current = following
        episode_return += gain
        if finished:
            break
    return episode_return


env = GridWorld()
agent = QLearningAgent(env.actions)
episodes = 1000
rewards_per_episode = [_run_episode(env, agent) for _ in range(episodes)]

# Plot the learning curve: total reward collected in each episode.
plt.figure(figsize=(10, 6))
plt.plot(rewards_per_episode)
plt.title('Q-Learning学习曲线')
plt.xlabel('回合数')
plt.ylabel('总奖励')
plt.show()
print(f"最后100回合平均奖励: {np.mean(rewards_per_episode[-100:])}")
🧠 深度 Q 网络(DQN)
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers and a linear output layer.

    Args:
        input_size: Number of input features per state.
        hidden_size: Width of both hidden layers.
        output_size: Number of outputs (one value per action).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        ]
        # Kept under the attribute name `network` so state_dict keys are unchanged.
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        q_values = self.network(x)
        return q_values
class DQNAgent:
    """DQN agent with a replay buffer, a target network and epsilon-greedy exploration.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions.
        learning_rate: Learning rate of the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.

    The target network is synchronized once at construction time; callers are
    expected to call ``update_target_network()`` periodically during training.
    """

    def __init__(self, state_size, action_size, learning_rate=0.001, gamma=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.memory = deque(maxlen=10000)
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        # Online network (trained) and target network (used for bootstrapping).
        self.q_network = DQN(state_size, 64, action_size)
        self.target_network = DQN(state_size, 64, action_size)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)
        # Start with both networks holding identical weights.
        self.update_target_network()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index for ``state``, chosen epsilon-greedily."""
        # The stdlib RNG is used for both draws so the class needs no NumPy import.
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():  # inference only, no graph needed
            q_values = self.q_network(state_tensor)
        return int(q_values.argmax(dim=1).item())

    def replay(self, batch_size=32):
        """Train the online network on one random mini-batch from the buffer.

        Does nothing while the buffer holds fewer than ``batch_size``
        transitions. After a training step epsilon is decayed, never below
        ``epsilon_min``.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Stack per-sample tensors: works for lists, tuples and arrays alike.
        states = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in states])
        next_states = torch.stack(
            [torch.as_tensor(s, dtype=torch.float32) for s in next_states]
        )
        actions = torch.as_tensor(actions, dtype=torch.int64)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        dones = torch.as_tensor(dones, dtype=torch.bool)
        # squeeze(1), not squeeze(): a batch of one must stay 1-D to match the target.
        current_q_values = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(dim=1)[0]
        # Terminal transitions do not bootstrap from the next state.
        target_q_values = rewards + self.gamma * next_q_values * (~dones).float()
        loss = nn.MSELoss()(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


print("DQN智能体已初始化,可用于复杂环境训练")
🛠️ 特征工程与数据预处理最佳实践
📊 数据预处理核心技术
特征工程是机器学习成功的关键,好的特征往往比复杂的算法更重要。
1. 数据清洗
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
# Create the example dataset (seeded so the article's output is reproducible).
np.random.seed(42)
data = {
    'age': np.random.randint(18, 80, 1000),
    'income': np.random.normal(50000, 15000, 1000),
    # Stored as dtype=object so real missing values can be written into it:
    # in a fixed-width string array NumPy coerces np.nan to text, which adds a
    # bogus category and leaves the column with no missing values at all.
    'education': np.random.choice(['高中', '本科', '硕士', '博士'], 1000).astype(object),
    'score': np.random.normal(75, 10, 1000)
}
# Inject missing values: 50 incomes and 30 education entries at random,
# non-repeating positions.
data['income'][np.random.choice(1000, 50, replace=False)] = np.nan
data['education'][np.random.choice(1000, 30, replace=False)] = np.nan
df = pd.DataFrame(data)
print("原始数据信息:")
print(df.info())
print("\n缺失值统计:")
print(df.isnull().sum())
# Handle missing values.
# Numeric feature: fill missing incomes with the column mean. The imputer is
# given a one-column frame and its output is flattened with ravel() before
# being assigned back to the column.
numeric_imputer = SimpleImputer(strategy='mean')
df['income'] = numeric_imputer.fit_transform(df[['income']]).ravel()
# Categorical feature: fill missing education values with the most frequent one.
categorical_imputer = SimpleImputer(strategy='most_frequent')
df['education'] = categorical_imputer.fit_transform(df[['education']]).ravel()
# Re-count the missing values per column after imputation.
print("\n处理后缺失值统计:")
print(df.isnull().sum())
2. 特征缩放
# Standardization (z-score): scale the three numeric columns on a copy of df.
scaler_standard = StandardScaler()
df_standard = df.copy()
df_standard[['age', 'income', 'score']] = scaler_standard.fit_transform(
    df[['age', 'income', 'score']]
)
# Min-max scaling of the same three columns, on a second copy of df.
scaler_minmax = MinMaxScaler()
df_minmax = df.copy()
df_minmax[['age', 'income', 'score']] = scaler_minmax.fit_transform(
    df[['age', 'income', 'score']]
)
# Compare the effect of the two scalers with three boxplot panels.
# Tick labels are set with set_xticklabels() rather than boxplot(labels=...):
# that keyword is deprecated in recent Matplotlib releases (renamed
# tick_labels), while set_xticklabels() works on old and new versions alike.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# Raw data (income is divided by 1000 so the three boxes share a readable axis).
axes[0].boxplot([df['age'], df['income']/1000, df['score']])
axes[0].set_xticklabels(['年龄', '收入(千)', '分数'])
axes[0].set_title('原始数据')
axes[0].set_ylabel('数值')
# After standardization.
axes[1].boxplot([df_standard['age'], df_standard['income'], df_standard['score']])
axes[1].set_xticklabels(['年龄', '收入', '分数'])
axes[1].set_title('标准化后')
axes[1].set_ylabel('标准化数值')
# After min-max scaling.
axes[2].boxplot([df_minmax['age'], df_minmax['income'], df_minmax['score']])
axes[2].set_xticklabels(['年龄', '收入', '分数'])
axes[2].set_title('最小-最大缩放后')
axes[2].set_ylabel('缩放数值')
plt.tight_layout()
plt.show()
3. 特征编码
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
# Label encoding: map each education value to an integer code.
# NOTE: LabelEncoder numbers the classes in sorted order of the strings, which
# is not the order of the education levels themselves.
label_encoder = LabelEncoder()
df['education_label'] = label_encoder.fit_transform(df['education'])
# One-hot encoding; drop='first' omits the first category's column.
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')
education_onehot = onehot_encoder.fit_transform(df[['education']])
# Ask the encoder for its output column names instead of slicing categories_
# by hand, so the names stay correct if the `drop` setting changes. Converted
# to a plain list because later code concatenates it with other column lists.
education_columns = onehot_encoder.get_feature_names_out(['education']).tolist()
# Reuse df's index so the concat below aligns row by row even when df does not
# carry the default 0..n-1 index.
education_df = pd.DataFrame(education_onehot, columns=education_columns, index=df.index)
# Combine the original columns with the one-hot columns.
df_encoded = pd.concat([df, education_df], axis=1)
print("编码前教育特征:")
print(df['education'].value_counts())
print("\n标签编码结果:")
print(df['education_label'].value_counts())
print("\n独热编码结果:")
print(education_df.head())
4. 特征选择
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
# Build the target: 1 where income is above the median income, else 0.
# NOTE: this rebinds the module-level names X and y used by the earlier
# classification examples.
y = (df['income'] > df['income'].median()).astype(int)
# Feature matrix: age, score, the label-encoded education column and the
# one-hot education columns (income itself is not included).
X = df_encoded[['age', 'score', 'education_label'] + education_columns]
# Univariate feature selection: keep the 3 best features as scored by f_classif.
selector_univariate = SelectKBest(score_func=f_classif, k=3)
X_selected_univariate = selector_univariate.fit_transform(X, y)
# Recursive feature elimination down to 3 features, with a random forest as
# the ranking estimator.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
selector_rfe = RFE(estimator=rf, n_features_to_select=3)
X_selected_rfe = selector_rfe.fit_transform(X, y)
# Feature importances: fit the forest on all features and tabulate its
# importances in descending order.
rf.fit(X, y)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
print("特征重要性排序:")
print(feature_importance)
# Plot the importances as a horizontal bar chart.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.title('随机森林特征重要性')
plt.xlabel('重要性')
plt.ylabel('特征')
plt.show()
5. 特征构造
# Build new features on a copy of df.
df_features = df.copy()
# Numeric transforms: squared age, log(1 + income), and age per 1000 income.
# NOTE(review): income is drawn from a normal distribution earlier in the
# file, so zero or negative values look possible in principle; log1p and the
# ratio would then give NaN/inf or negative results. Confirm against the data.
df_features['age_squared'] = df_features['age'] ** 2
df_features['income_log'] = np.log1p(df_features['income'])
df_features['age_income_ratio'] = df_features['age'] / (df_features['income'] / 1000)
# Binned features: fixed-edge age groups, and income quartiles (q=4).
df_features['age_group'] = pd.cut(df_features['age'],
                                  bins=[0, 30, 50, 70, 100],
                                  labels=['青年', '中年', '中老年', '老年'])
df_features['income_level'] = pd.qcut(df_features['income'],
                                      q=4,
                                      labels=['低收入', '中低收入', '中高收入', '高收入'])
# Interaction feature: label-encoded education multiplied by score.
df_features['education_score_interaction'] = (
    df_features['education_label'] * df_features['score']
)
print("构造的新特征:")
print(df_features[['age_squared', 'income_log', 'age_income_ratio',
                   'age_group', 'income_level', 'education_score_interaction']].head())
🔧 完整的特征工程管道
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
# Define the preprocessing pipeline.
# 'income' is deliberately NOT a model feature: the target `y` bound earlier in
# this file is (income > median income), so feeding income to the classifier
# leaks the label and drives the cross-validated accuracy to a meaningless ~1.0.
numeric_features = ['age', 'score']
categorical_features = ['education']
# Numeric columns: median imputation followed by standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical columns: most-frequent imputation followed by one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])
# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
# Full pipeline: preprocessing + classifier. Because preprocessing lives inside
# the pipeline it is re-fitted on the training part of every CV fold.
ml_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
# 5-fold cross-validated accuracy.
scores = cross_val_score(ml_pipeline,
                         df[numeric_features + categorical_features],
                         y,
                         cv=5,
                         scoring='accuracy')
print(f"交叉验证准确率: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
# Fit the final model on all rows.
ml_pipeline.fit(df[numeric_features + categorical_features], y)
print("\n特征工程管道训练完成!")
🎯 总结与实践建议
📈 机器学习最佳实践
📊 数据质量优先
确保数据的准确性和完整性
处理异常值和缺失值
理解数据的业务含义
🔧 特征工程是关键
领域知识驱动的特征构造
合理的特征选择和降维
避免数据泄露
⚖️ 模型选择策略
从简单模型开始
根据问题类型选择合适算法
考虑可解释性需求
📏 评估与验证
使用合适的评估指标
使用交叉验证评估泛化能力,及早发现过拟合
在独立测试集上验证
🚀 进阶学习路径
# Learning path, printed stage by stage
learning_path = {
    "基础阶段": ["掌握Python和相关库", "理解统计学基础", "熟悉经典算法"],
    "进阶阶段": ["深度学习框架", "特征工程技巧", "模型调优方法"],
    "高级阶段": ["MLOps实践", "模型部署", "A/B测试"],
}

# Each stage prints a blank line, its name, then one check-marked line per skill.
for stage, skills in learning_path.items():
    stage_lines = [f"\n{stage}:"]
    stage_lines.extend(f" ✅ {skill}" for skill in skills)
    print("\n".join(stage_lines))
💡 实战项目建议
🏠 房价预测:回归问题入门
📧 垃圾邮件分类:文本分类实践
🛒 推荐系统:协同过滤算法
📈 股票预测:时间序列分析
🖼️ 图像识别:深度学习应用
🎉 结语:机器学习是一个不断发展的领域,理论与实践并重。通过系统学习监督学习、无监督学习、强化学习和特征工程,你将具备解决实际问题的能力。记住,最好的学习方式是动手实践!
📝 如果这篇文章对你有帮助,别忘了点赞、评论和分享,这对我持续创作非常重要!

野猪🐗 佩琪
还未添加个人签名 2020-09-29 加入
还未添加个人简介
评论