零基础学机器学习 - 黄佳

本书给出了一个机器学习入门路线图，从一个零基础的初学者视角出发，让读者跟着这位初学者一步一步向前，循序渐进地学习知识。内容涵盖机器学习、深度学习和强化学习的基础知识。

关于作者

黄佳是人工智能领域的技术作家和教育者：

AI 技术专家：专注于机器学习和深度学习领域
技术作家：著有多本 AI 和机器学习相关书籍
教育者：擅长用通俗易懂的方式讲解复杂的 AI 概念

黄佳以其"零基础学"系列的写作风格著称，通过大量图解和代码示例，帮助零基础的读者入门机器学习。

核心内容

1. 机器学习基础

# The three main types of machine learning

# 1. Supervised Learning
# Characteristics: labeled data; learns a mapping from inputs to outputs
# Applications: classification, regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Classification example
# NOTE(review): X and y are not defined before this point in the file;
# presumably a feature matrix and a label vector loaded elsewhere — confirm.
# test_size=0.2 holds out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LogisticRegression()
model.fit(X_train, y_train)  # fit on the training split only
predictions = model.predict(X_test)  # predictions for the held-out split

# 2. Unsupervised Learning
# Characteristics: unlabeled data; discovers the intrinsic structure of the data
# Applications: clustering, dimensionality reduction, anomaly detection

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Clustering example: fit 3 clusters on X and get a cluster label per sample
kmeans = KMeans(n_clusters=3)
clusters = kmeans.fit_predict(X)

# Dimensionality-reduction example: project X onto 2 components
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

# 3. Reinforcement Learning
# Characteristics: learns by interacting with an environment, maximizing cumulative reward
# Applications: game AI, robot control, autonomous driving

import gym

env = gym.make('CartPole-v1')
# The agent learns an optimal policy by trial and error

2. 线性回归

import numpy as np
from sklearn.linear_model import LinearRegression

# Simple linear regression
# y = w * x + b

# Five samples with a single feature each, and their targets
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 4, 5, 4, 5])

model = LinearRegression()
model.fit(X, y)

# Report the fitted weight, the intercept, and the score on the training data
print(f"权重：{model.coef_[0]:.2f}")
print(f"截距：{model.intercept_:.2f}")
print(f"R² 分数：{model.score(X, y):.2f}")

# Loss function: mean squared error (MSE)
# L = (1/n) * Σ(y_pred - y_true)²

# Gradient descent optimization
def gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    """Fit a linear model ``y ≈ X @ weights + bias`` by batch gradient descent.

    Every epoch takes one step along the negative gradient of the halved
    mean squared error ``(1 / (2n)) * Σ(y_pred - y)²``; the factor 2 of the
    plain MSE gradient is absorbed into ``learning_rate``.

    Args:
        X: Feature matrix of shape ``(n_samples, n_features)``. A 1-D
            sequence is treated as ``n_samples`` rows with a single feature.
        y: Targets, one per sample. The input is flattened, so a column
            vector of shape ``(n_samples, 1)`` is accepted as well.
        learning_rate: Step size applied to every parameter update.
        epochs: Number of parameter updates (full passes over the data).

    Returns:
        Tuple ``(weights, bias)``: a float array of shape ``(n_features,)``
        and a scalar intercept. With ``epochs=0`` both are zero.

    Raises:
        ValueError: If ``X`` is not 1-D or 2-D, contains no samples, or
            ``X`` and ``y`` disagree on the number of samples.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    if X.ndim != 2:
        raise ValueError(f"X must be 1-D or 2-D, got {X.ndim} dimensions")

    # Flatten y: subtracting an (n, 1) column from the (n,) predictions would
    # silently broadcast to an (n, n) matrix and corrupt the gradients.
    y = np.asarray(y, dtype=float).ravel()

    n_samples, n_features = X.shape
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X has {n_samples} samples but y has {y.shape[0]} targets"
        )

    weights = np.zeros(n_features)
    bias = 0.0

    for _ in range(epochs):
        # Forward pass: prediction error of the current parameters
        residual = X @ weights + bias - y

        # Gradients of the halved MSE with respect to weights and bias
        dw = X.T @ residual / n_samples
        db = residual.mean()

        # Parameter update
        weights -= learning_rate * dw
        bias -= learning_rate * db

    return weights, bias

3. 逻辑回归与分类

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic regression: the sigmoid function
# P(y=1|x) = 1 / (1 + exp(-w*x - b))

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` element-wise.

    The exponential is only ever evaluated at ``-|z|``, so inputs of large
    magnitude cannot overflow (the direct formula overflows inside ``exp``
    for very negative ``z``).

    Args:
        z: Scalar, sequence, or NumPy array of real values.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``z``; every value lies in ``[0, 1]``.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) — same value, no overflow
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a NumPy scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]

# Multi-class strategies
# 1. One-vs-Rest (OvR)
# 2. Softmax (multinomial) regression

# NOTE: the `multi_class` argument is deprecated in scikit-learn >= 1.5.
# With the lbfgs solver the default already fits a multinomial (softmax)
# model when there are three or more classes, so the argument is omitted.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000
)
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Evaluation metrics
# - Accuracy: proportion of correct predictions
# - Precision: of the samples predicted positive, how many are truly positive
# - Recall: of the truly positive samples, how many were predicted correctly
# - F1 score: harmonic mean of precision and recall

4. 决策树与随机森林

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Decision tree
# Core idea: choose the best feature and split point to split on
# Criteria: information gain, Gini index

dt_model = DecisionTreeClassifier(
    max_depth=5,           # maximum depth
    min_samples_split=10,  # minimum number of samples required to split an internal node
    min_samples_leaf=5,    # minimum number of samples required at a leaf node
    criterion='gini'       # split criterion: gini or entropy
)
dt_model.fit(X_train, y_train)

# Visualize the decision tree
# NOTE(review): feature_names is not defined anywhere in this file;
# presumably the column names of X_train — confirm.
plt.figure(figsize=(20, 10))
plot_tree(dt_model, feature_names=feature_names, filled=True)
plt.show()

# Random forest: an ensemble of multiple decision trees
# Core idea: bagging + random feature selection

rf_model = RandomForestClassifier(
    n_estimators=100,      # number of trees
    max_depth=10,
    min_samples_split=5,
    n_jobs=-1              # train in parallel
)
rf_model.fit(X_train, y_train)

# Feature importances
importances = rf_model.feature_importances_

5. 支持向量机 (SVM)

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Core idea of SVM: find the maximum-margin hyperplane
# Support vectors: the samples closest to the hyperplane

# Kernel functions: handle data that is not linearly separable
# - linear: linear kernel
# - poly: polynomial kernel
# - rbf: radial basis function kernel (most commonly used)
# - sigmoid: sigmoid kernel

# Build a pipeline: standardize first, then apply the SVM
model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=1.0, gamma='scale')
)

model.fit(X_train, y_train)

# Hyperparameters:
# C: regularization parameter; the larger it is, the easier it is to overfit
# gamma: kernel coefficient; the larger it is, the easier it is to overfit

6. 神经网络基础

import tensorflow as tf
from tensorflow import keras

# Multilayer perceptron (MLP)
# Made up of an input layer, hidden layers and an output layer

# Layers are stacked one at a time, in the order data flows through them.
model = keras.Sequential()
model.add(keras.layers.Dense(128, activation='relu', input_shape=(n_features,)))
model.add(keras.layers.Dropout(0.2))  # guards against overfitting
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(n_classes, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[
        # Early stopping watches val_loss with a patience of 10 epochs and
        # restores the best weights seen.
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )
    ]
)

# Common activation functions
# - ReLU: f(x) = max(0, x)  most commonly used
# - Sigmoid: f(x) = 1/(1+exp(-x))  output layer (binary classification)
# - Tanh: f(x) = (exp(x)-exp(-x))/(exp(x)+exp(-x))
# - Softmax: output layer for multi-class classification

7. 深度学习进阶

# Convolutional neural network (CNN) - image processing
# Two convolution/pooling stages followed by a dense classification head.
cnn_model = keras.Sequential()
cnn_model.add(keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)))
cnn_model.add(keras.layers.MaxPooling2D())
cnn_model.add(keras.layers.Conv2D(64, 3, activation='relu'))
cnn_model.add(keras.layers.MaxPooling2D())
cnn_model.add(keras.layers.Flatten())
cnn_model.add(keras.layers.Dense(128, activation='relu'))
cnn_model.add(keras.layers.Dropout(0.5))
cnn_model.add(keras.layers.Dense(10, activation='softmax'))

# Recurrent neural network (RNN) - sequence data
# Embedding, two stacked LSTM layers, then a single sigmoid output unit.
rnn_model = keras.Sequential()
rnn_model.add(keras.layers.Embedding(vocab_size, 128, input_length=max_length))
rnn_model.add(keras.layers.LSTM(64, return_sequences=True))
rnn_model.add(keras.layers.LSTM(32))
rnn_model.add(keras.layers.Dense(1, activation='sigmoid'))

# Transformer - the foundation of modern NLP
# Self-attention mechanism + positional encoding + feed-forward network

# Pretrained models (transfer learning)
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = TFBertModel.from_pretrained('bert-base-uncased')

# Freeze the BERT parameters and train only the classification head
# NOTE(review): no classification head is defined in this snippet.
bert.trainable = False

8. 强化学习入门

import gym
import numpy as np

# Q-Learning algorithm
# Q(s, a): expected return of taking action a in state s

class QLearningAgent:
    """Tabular Q-learning agent with an ε-greedy action rule."""

    def __init__(self, n_states, n_actions, learning_rate=0.1, discount=0.95):
        """Create an all-zero Q-table of shape ``(n_states, n_actions)``.

        ``learning_rate`` is stored as ``lr`` (α in the update rule) and
        ``discount`` as ``gamma`` (γ).
        """
        self.lr = learning_rate
        self.gamma = discount
        self.q_table = np.zeros((n_states, n_actions))

    def get_action(self, state, epsilon=0.1):
        """Choose an action for ``state`` using the ε-greedy rule.

        A uniform random draw below ``epsilon`` selects a random action
        index; otherwise the action with the largest Q-value is returned.
        """
        # ε-greedy: balance exploration and exploitation
        explore = np.random.random() < epsilon
        if not explore:
            return np.argmax(self.q_table[state])
        n_actions = self.q_table.shape[1]
        return np.random.randint(n_actions)

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update to the entry for (state, action).

        Q(s,a) <- Q(s,a) + α * (r + γ * max_a' Q(s',a') - Q(s,a))
        """
        q_row = self.q_table[state]
        # Value of the greedy action in the successor state
        bootstrap = np.max(self.q_table[next_state])
        q_row[action] += self.lr * (reward + self.gamma * bootstrap - q_row[action])

# Use a gym environment; the agent is sized from its discrete spaces
env = gym.make('FrozenLake-v1')
agent = QLearningAgent(env.observation_space.n, env.action_space.n)

# Training
for episode in range(10000):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return the observation alone. FrozenLake observations are plain
    # integers, so a tuple can only be the new-style pair.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False

    while not done:
        action = agent.get_action(state)
        step_result = env.step(action)
        if len(step_result) == 5:
            # New API: termination and time-limit truncation are reported
            # separately; either one ends the episode.
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Old API: a single done flag
            next_state, reward, done, _ = step_result
        agent.update(state, action, reward, next_state)
        state = next_state

9. 模型评估与调优

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Cross-validation: cv=5, scored by accuracy
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"平均准确率：{scores.mean():.3f} (+/- {scores.std():.3f})")

# Hyperparameter tuning with grid search
# The grid spans 3 * 4 * 3 = 36 parameter combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

print(f"最佳参数：{grid_search.best_params_}")
print(f"最佳分数：{grid_search.best_score_:.3f}")

# ROC curve and AUC
# Scores for the positive class come from the tuned estimator.
# Assumes a binary problem whose positive class is column 1 of
# predict_proba — confirm against the label encoding.
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

经典摘录

机器学习不是魔法，它是数学和统计学的结合。理解背后的原理，才能更好地应用它。

数据决定上限，算法只是逼近这个上限。好的数据比复杂的算法更重要。

过拟合是机器学习的头号敌人。正则化、交叉验证、早停都是为了防止过拟合。

没有免费的午餐定理：没有一种算法在所有问题上都表现最好。选择合适的算法需要经验和实验。

从简单模型开始，逐步增加复杂度。如果简单模型已经足够好，就不要用复杂模型。

读书心得

《零基础学机器学习》是一本非常适合入门的机器学习教材。作者从零基础的视角出发，循序渐进地介绍了机器学习的核心概念和常用算法。

书中对我帮助最大的是由浅入深的知识体系。从最简单的线性回归开始，到逻辑回归、决策树、SVM，再到神经网络和深度学习，每一步都有清晰的代码示例和原理解释。这种编排方式让没有数学背景的读者也能跟上。

实战代码部分非常实用。书中使用了 scikit-learn、TensorFlow 等主流库，代码简洁易懂。通过动手实现这些示例，能够快速掌握机器学习的基本流程：数据准备、模型选择、训练、评估、调优。

强化学习部分虽然是入门级别，但也让读者对这个重要领域有了基本了解。Q-Learning 的实现示例简洁明了，展示了强化学习的核心思想。

对于想要入门机器学习的开发者来说，这本书是很好的起点。它不会让你成为专家，但会帮你建立正确的知识框架，为后续深入学习打下基础。

建议配合吴恩达的机器学习课程一起学习，理论 + 实践，效果会更好。

关于作者

核心内容

1. 机器学习基础

2. 线性回归

3. 逻辑回归与分类

4. 决策树与随机森林

5. 支持向量机 (SVM)

6. 神经网络基础

7. 深度学习进阶

8. 强化学习入门

9. 模型评估与调优

经典摘录

读书心得