import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['STfangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
array1 = np.arange(1, 10).reshape(3, 3)
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
array2 = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
array([[1, 1, 1],
[2, 2, 2],
[3, 3, 3]])
# Concatenate the two arrays horizontally
np.hstack((array1, array2))
array([[1, 2, 3, 1, 1, 1],
[4, 5, 6, 2, 2, 2],
[7, 8, 9, 3, 3, 3]])
# Concatenate the two arrays vertically
np.vstack((array1, array2))
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[1, 1, 1],
[2, 2, 2],
[3, 3, 3]])
# Concatenate along a chosen axis (no axis argument is passed in this call)
np.concatenate((array1, array2))
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[1, 1, 1],
[2, 2, 2],
[3, 3, 3]])
np.concatenate((array1, array2), axis=1)
array([[1, 2, 3, 1, 1, 1],
[4, 5, 6, 2, 2, 2],
[7, 8, 9, 3, 3, 3]])
# Split vertically into 3 pieces
np.vsplit(array2, 3)
[array([[1, 1, 1]]), array([[2, 2, 2]]), array([[3, 3, 3]])]
# Split horizontally into 3 pieces
np.hsplit(array2, 3)
# Append an element at the end
np.append(array1, 10)
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# Insert an element at a given position (here: value 0 at index 0)
np.insert(array1, 0, 0)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array1[array1 % 3 == 0]
array([3, 6, 9])
# Filter data by a condition (elements divisible by 3)
np.extract(array1 % 3 == 0, array1)
array([3, 6, 9])
# Pick values according to conditions and formulas
x = np.arange(10)
# Conditions and the expressions paired with them, in the same order
condlist = [x < 3, x > 5]
choicelist = [x, x ** 2]
# Positions matching neither condition receive the default (NaN)
np.select(condlist, choicelist, default=np.nan)
array([ 0., 1., 2., nan, nan, nan, 36., 49., 64., 81.])
# Pick values according to a condition and formulas:
# x where x < 5, otherwise 10 * x
np.where(x < 5, x, 10 * x)
array([ 0, 1, 2, 3, 4, 50, 60, 70, 80, 90])
def fib(n):
    """Generate the first ``n`` Fibonacci numbers, starting 1, 1, 2, 3, ...

    Args:
        n: Number of terms to produce; nothing is yielded when ``n <= 0``.

    Yields:
        The next Fibonacci number on each iteration.
    """
    a, b = 0, 1
    for _ in range(n):
        # Advance the pair one step, then emit the new current term.
        a, b = b, a + b
        yield a
gen = fib(20)
# Build an array from an iterator (generator); count=10 is passed,
# so the resulting array holds 10 items even though gen can yield 20
array3 = np.fromiter(gen, dtype=np.int64, count=10)
array([ 1, 1, 2, 3, 5, 8, 13, 21, 34, 55], dtype=int64)
# Resize the array to a new shape
np.resize(array1, (4, 4))
array([[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 1, 2, 3],
[4, 5, 6, 7]])
$$ A \cdot B = a_1b_1 + a_2b_2 = \lvert A \rvert \lvert B \rvert \cos \theta $$
$$ A \cdot B = \sum_{i=1}^{n} a_ib_i = \lvert A \rvert \lvert B \rvert \cos \theta $$
# Two 2-D vectors used for the dot-product example
v1 = np.array([3, 5])
v2 = np.array([1, 3])
# inner_prod = np.dot(v1, v2)
inner_prod = np.inner(v1, v2)
print('向量点积:', inner_prod)
向量点积: 18
说明:在欧几里得几何中,两个笛卡尔坐标向量的点积也称为内积(inner product)。不过内积的概念比点积更一般:点积是内积在欧几里得空间 $\mathbb{R}^n$ 中的特例,而内积可以推广到更一般的内积空间(即定义了内积的向量空间)。
v1_norm = np.linalg.norm(v1)
v2_norm = np.linalg.norm(v2)
print('v1的模:', np.round(v1_norm, 6))
print('v2的模:', np.round(v2_norm, 6))
v1的模: 5.830952
v2的模: 3.162278
cos_theta = inner_prod / (v1_norm * v2_norm)
print('向量夹角余弦值:', cos_theta)
print('夹角:', np.arccos(cos_theta) * 180 / np.pi)
向量夹角余弦值: 0.9761870601839526
夹角: 12.52880770915155
$$ \det(A) = \sum_{n!} \pm a_{1\alpha}a_{2\beta}a_{3\gamma} \cdots a_{n\omega} $$
array4 = np.stack((v1, v2))
array([[3, 5],
[1, 3]])
$$ \det \begin{vmatrix} 3 & 5 \\ 1 & 3 \end{vmatrix} = 4 $$
# Compute the determinant (rounded to 2 decimals)
np.round(np.linalg.det(array4), 2)
$$ \det \begin{vmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{vmatrix} = 0 $$
array1 = np.arange(1, 10).reshape((3, 3))
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
$$ \begin{vmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \\ 7 & 8 & 9 \end{vmatrix} \quad \to \quad \begin{vmatrix} 1 & 2 & 3 \\ 0 & -3 & -6 \\ 0 & -6 & -12 \end{vmatrix} $$
# Compute the inverse matrix
# LinAlgError ---> Singular matrix ---> a singular matrix cannot be inverted
# np.linalg.inv(array1)
array2 = np.array([[1, 2], [3, 4]])
array([[1, 2],
[3, 4]])
array3 = np.linalg.inv(array2)
array([[-2. , 1. ],
[ 1.5, -0.5]])
$$ A \cdot A^{-1} = I $$
np.round(array2 @ array3, 2)
array([[1., 0.],
[0., 1.]])
# Compute the rank of a matrix
# NOTE(review): only the element update below is visible in this section; the rank
# call itself (presumably np.linalg.matrix_rank) does not appear here — confirm.
array1[2, 2] = 8
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 8]])
$$ \begin{cases} 3x + y = 9 \\ x + 2y = 8 \end{cases} $$
A = np.array([[3, 1], [1, 2]])
b = np.array([9, 8]).reshape(-1, 1)
np.linalg.solve(A, b)
$$ Ax = b \\ A^{-1}Ax = A^{-1}b \\ Ix = A^{-1}b \\ x = A^{-1}b $$
A_1 = np.linalg.inv(A)
array([[ 0.4, -0.2],
[-0.2, 0.6]])
A_1 @ b
from sklearn.datasets import load_boston
# Load the Boston house-price data
# NOTE(review): load_boston is believed to have been removed from recent
# scikit-learn releases — confirm against the installed version.
dataset = load_boston()
.. _boston_dataset:
Boston house prices dataset
**Data Set Characteristics:**
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.
:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of black people by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
:Missing Attribute Values: None
:Creator: Harrison, D. and Rubinfeld, D.L.
This is a copy of UCI ML housing dataset.
This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.
The Boston house-price data has been used in many machine learning papers that address regression
.. topic:: References
- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
(506, 13)
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
# Build a DataFrame from the Boston house-price data,
# using the dataset's feature names as column labels
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1.0 | 273.0 | 21.0 | 391.99 | 9.67 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 21.0 | 396.90 | 9.08 |
503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1.0 | 273.0 | 21.0 | 396.90 | 5.64 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1.0 | 273.0 | 21.0 | 393.45 | 6.48 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1.0 | 273.0 | 21.0 | 396.90 | 7.88 |
506 rows × 13 columns
# Add the house-price column
df['PRICE'] = dataset.target
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1.0 | 273.0 | 21.0 | 391.99 | 9.67 | 22.4 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 21.0 | 396.90 | 9.08 | 20.6 |
503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1.0 | 273.0 | 21.0 | 396.90 | 5.64 | 23.9 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1.0 | 273.0 | 21.0 | 393.45 | 6.48 | 22.0 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1.0 | 273.0 | 21.0 | 396.90 | 7.88 | 11.9 |
506 rows × 14 columns
# 计算协方差
CRIM | 73.986578 | -40.215956 | 23.992339 | -0.122109 | 0.419594 | -1.325038 | 85.405322 | -6.876722 | 46.847761 | 844.821538 | 5.399331 | -302.381816 | 27.986168 | -30.718508 |
ZN | -40.215956 | 543.936814 | -85.412648 | -0.252925 | -1.396148 | 5.112513 | -373.901548 | 32.629304 | -63.348695 | -1236.453735 | -19.776571 | 373.721402 | -68.783037 | 77.315176 |
INDUS | 23.992339 | -85.412648 | 47.064442 | 0.109669 | 0.607074 | -1.887957 | 124.513903 | -10.228097 | 35.549971 | 833.360290 | 5.692104 | -223.579756 | 29.580270 | -30.520823 |
CHAS | -0.122109 | -0.252925 | 0.109669 | 0.064513 | 0.002684 | 0.016285 | 0.618571 | -0.053043 | -0.016296 | -1.523367 | -0.066819 | 1.131325 | -0.097816 | 0.409409 |
NOX | 0.419594 | -1.396148 | 0.607074 | 0.002684 | 0.013428 | -0.024603 | 2.385927 | -0.187696 | 0.616929 | 13.046286 | 0.047397 | -4.020570 | 0.488946 | -0.455412 |
RM | -1.325038 | 5.112513 | -1.887957 | 0.016285 | -0.024603 | 0.493671 | -4.751929 | 0.303663 | -1.283815 | -34.583448 | -0.540763 | 8.215006 | -3.079741 | 4.493446 |
AGE | 85.405322 | -373.901548 | 124.513903 | 0.618571 | 2.385927 | -4.751929 | 792.358399 | -44.329379 | 111.770846 | 2402.690122 | 15.936921 | -702.940328 | 121.077725 | -97.589017 |
DIS | -6.876722 | 32.629304 | -10.228097 | -0.053043 | -0.187696 | 0.303663 | -44.329379 | 4.434015 | -9.068252 | -189.664592 | -1.059775 | 56.040356 | -7.473329 | 4.840229 |
RAD | 46.847761 | -63.348695 | 35.549971 | -0.016296 | 0.616929 | -1.283815 | 111.770846 | -9.068252 | 75.816366 | 1335.756577 | 8.760716 | -353.276219 | 30.385442 | -30.561228 |
TAX | 844.821538 | -1236.453735 | 833.360290 | -1.523367 | 13.046286 | -34.583448 | 2402.690122 | -189.664592 | 1335.756577 | 28404.759488 | 168.153141 | -6797.911215 | 654.714520 | -726.255716 |
PTRATIO | 5.399331 | -19.776571 | 5.692104 | -0.066819 | 0.047397 | -0.540763 | 15.936921 | -1.059775 | 8.760716 | 168.153141 | 4.686989 | -35.059527 | 5.782729 | -10.110657 |
B | -302.381816 | 373.721402 | -223.579756 | 1.131325 | -4.020570 | 8.215006 | -702.940328 | 56.040356 | -353.276219 | -6797.911215 | -35.059527 | 8334.752263 | -238.667516 | 279.989834 |
LSTAT | 27.986168 | -68.783037 | 29.580270 | -0.097816 | 0.488946 | -3.079741 | 121.077725 | -7.473329 | 30.385442 | 654.714520 | 5.782729 | -238.667516 | 50.994760 | -48.447538 |
PRICE | -30.718508 | 77.315176 | -30.520823 | 0.409409 | -0.455412 | 4.493446 | -97.589017 | 4.840229 | -30.561228 | -726.255716 | -10.110657 | 279.989834 | -48.447538 | 84.586724 |
# Compute the Pearson correlation coefficients (rounded to 2 decimals)
np.round(df.corr(), 2)
CRIM | 1.00 | -0.20 | 0.41 | -0.06 | 0.42 | -0.22 | 0.35 | -0.38 | 0.63 | 0.58 | 0.29 | -0.39 | 0.46 | -0.39 |
ZN | -0.20 | 1.00 | -0.53 | -0.04 | -0.52 | 0.31 | -0.57 | 0.66 | -0.31 | -0.31 | -0.39 | 0.18 | -0.41 | 0.36 |
INDUS | 0.41 | -0.53 | 1.00 | 0.06 | 0.76 | -0.39 | 0.64 | -0.71 | 0.60 | 0.72 | 0.38 | -0.36 | 0.60 | -0.48 |
CHAS | -0.06 | -0.04 | 0.06 | 1.00 | 0.09 | 0.09 | 0.09 | -0.10 | -0.01 | -0.04 | -0.12 | 0.05 | -0.05 | 0.18 |
NOX | 0.42 | -0.52 | 0.76 | 0.09 | 1.00 | -0.30 | 0.73 | -0.77 | 0.61 | 0.67 | 0.19 | -0.38 | 0.59 | -0.43 |
RM | -0.22 | 0.31 | -0.39 | 0.09 | -0.30 | 1.00 | -0.24 | 0.21 | -0.21 | -0.29 | -0.36 | 0.13 | -0.61 | 0.70 |
AGE | 0.35 | -0.57 | 0.64 | 0.09 | 0.73 | -0.24 | 1.00 | -0.75 | 0.46 | 0.51 | 0.26 | -0.27 | 0.60 | -0.38 |
DIS | -0.38 | 0.66 | -0.71 | -0.10 | -0.77 | 0.21 | -0.75 | 1.00 | -0.49 | -0.53 | -0.23 | 0.29 | -0.50 | 0.25 |
RAD | 0.63 | -0.31 | 0.60 | -0.01 | 0.61 | -0.21 | 0.46 | -0.49 | 1.00 | 0.91 | 0.46 | -0.44 | 0.49 | -0.38 |
TAX | 0.58 | -0.31 | 0.72 | -0.04 | 0.67 | -0.29 | 0.51 | -0.53 | 0.91 | 1.00 | 0.46 | -0.44 | 0.54 | -0.47 |
PTRATIO | 0.29 | -0.39 | 0.38 | -0.12 | 0.19 | -0.36 | 0.26 | -0.23 | 0.46 | 0.46 | 1.00 | -0.18 | 0.37 | -0.51 |
B | -0.39 | 0.18 | -0.36 | 0.05 | -0.38 | 0.13 | -0.27 | 0.29 | -0.44 | -0.44 | -0.18 | 1.00 | -0.37 | 0.33 |
LSTAT | 0.46 | -0.41 | 0.60 | -0.05 | 0.59 | -0.61 | 0.60 | -0.50 | 0.49 | 0.54 | 0.37 | -0.37 | 1.00 | -0.74 |
PRICE | -0.39 | 0.36 | -0.48 | 0.18 | -0.43 | 0.70 | -0.38 | 0.25 | -0.38 | -0.47 | -0.51 | 0.33 | -0.74 | 1.00 |
# Room counts and prices as arrays, plus a room -> price lookup table.
# When a room count occurs more than once, the last price seen is the one kept.
rooms, prices = df['RM'].values, df['PRICE'].values
history_data = dict(zip(rooms, prices))
import heapq
nums = [35, 98, 76, 12, 55, 68, 47, 92]
print(heapq.nlargest(3, nums))
print(heapq.nsmallest(3, nums))
[98, 92, 76]
[12, 35, 47]
import heapq
# kNN算法
def predict_price_by_knn(history_data, room, k=5):
    """Estimate a price as the mean price of the ``k`` nearest room counts.

    Args:
        history_data: Mapping from room count to observed price.
        room: Room count to predict a price for.
        k: Number of nearest neighbours to average (default 5). When the
            mapping holds fewer than ``k`` entries, all of them are averaged.

    Returns:
        The mean of the prices of the selected neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``history_data`` is empty
            (the mean of zero neighbours is undefined).
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    if not history_data:
        raise ValueError('history_data must not be empty')
    # nsmallest picks the k closest keys by squared distance without
    # sorting every key of the mapping.
    keys = heapq.nsmallest(k, history_data, key=lambda x: (x - room) ** 2)
    return np.mean([history_data[key] for key in keys])
# Predict house prices
np.round(predict_price_by_knn(history_data, 6.25), 2)
np.round(predict_price_by_knn(history_data, 5.125), 2)
# Use a scatter plot to study the relationship between the variables
plt.scatter(rooms, prices)
回归方程:$x$ 代表房间数,$y$ 就是要预测的房价。
$$ y = ax + b $$
均方误差:让均方误差最小的 $a$ 和 $b$ 就是最佳拟合。
$$ MSE = \frac{1}{N} \sum (\hat{y_i} - y_i)^2 $$
def get_loss(x, y, a, b):
    """Return the mean squared error of the line ``y = a * x + b``.

    Args:
        x: Input values (array or plain sequence of numbers).
        y: Observed values, same length as ``x``.
        a: Slope of the line.
        b: Intercept of the line.

    Returns:
        The mean of the squared differences between prediction and ``y``.
    """
    # Convert up front so plain lists work as well as NumPy arrays.
    x = np.asarray(x)
    y = np.asarray(y)
    y_hat = a * x + b
    return np.mean((y_hat - y) ** 2)
# Search for the best-fitting a and b by Monte Carlo simulation (random search)
import random

best_a, best_b = None, None
min_loss = np.inf
for _ in range(1000):
    # Draw random candidates for a and b from [-100, 100)
    a = random.random() * 200 - 100
    b = random.random() * 200 - 100
    # Loss (MSE) of the candidate line on the data
    curr_loss = get_loss(rooms, prices, a, b)
    # A candidate with a smaller loss is a better fit, so keep it
    if curr_loss < min_loss:
        min_loss = curr_loss
        best_a, best_b = a, b
print(best_a, best_b)
12.414266461017732 -56.48722240398021
$$ a^\prime = a + (-1) \times \frac{\partial loss(a, b)}{\partial a} \times \Delta $$
$$ b^\prime = b + (-1) \times \frac{\partial loss(a, b)}{\partial b} \times \Delta $$
$$ f(a, b) = \frac{1}{N} \sum_{i=1}^{N}(y_i - (ax_i + b))^2 $$
$$ \frac{\partial f(a, b)}{\partial a} = \frac{2}{N} \sum_{i=1}^{N}(-x_iy_i + x_i^2a + x_ib) $$
$$ \frac{\partial f(a, b)}{\partial b} = \frac{2}{N} \sum_{i=1}^{N}(-y_i + x_ia + b) $$
# Partial derivative with respect to a
def partial_a(x, y, a, b):
    """Return the partial derivative of the MSE loss with respect to ``a``.

    Args:
        x: Input values (array or plain sequence of numbers).
        y: Observed values, same length as ``x``.
        a: Current slope.
        b: Current intercept.

    Returns:
        ``2 * mean((y - a * x - b) * (-x))``.
    """
    # Convert up front so plain lists work as well as NumPy arrays.
    x = np.asarray(x)
    y = np.asarray(y)
    return 2 * np.mean((y - a * x - b) * (-x))
# Partial derivative with respect to b
def partial_b(x, y, a, b):
    """Return the partial derivative of the MSE loss with respect to ``b``.

    Args:
        x: Input values (array or plain sequence of numbers).
        y: Observed values, same length as ``x``.
        a: Current slope.
        b: Current intercept.

    Returns:
        ``2 * mean(-y + a * x + b)``.
    """
    # Convert up front so plain lists work as well as NumPy arrays.
    x = np.asarray(x)
    y = np.asarray(y)
    return 2 * np.mean(-y + a * x + b)
# Approach the turning point of the loss by gradient descent
# This approach finds the best-fitting a and b faster
# The starting a and b can be chosen freely; delta must be small enough
a, b = 35, -35
delta = 0.01
for _ in range(100):
    a = a - partial_a(rooms, prices, a, b) * delta
    # NOTE(review): this step uses the a already updated on the line above,
    # not the a from the start of the iteration — confirm that is intended.
    b = b - partial_b(rooms, prices, a, b) * delta
print(a, b)
print(get_loss(rooms, prices, a, b))
9.276809660789766 -35.781905844032686
# Predict a house price with the linear regression equation
def predict_price_by_regression(a, b, x):
    """Evaluate the regression line ``a * x + b``.

    Args:
        a: Slope of the fitted line.
        b: Intercept of the fitted line.
        x: Value (or array of values) to predict for.

    Returns:
        ``a * x + b``.
    """
    return a * x + b
# Predict house prices
# (best_a, best_b) come from the random search; (a, b) from gradient descent
print(np.round(predict_price_by_regression(best_a, best_b, 6.25), 2))
print(np.round(predict_price_by_regression(a, b, 6.25), 2))
print(np.round(predict_price_by_regression(best_a, best_b, 5.12), 2))
print(np.round(predict_price_by_regression(a, b, 5.12), 2))
# Compare the two fitted lines
y_hat1 = best_a * rooms + best_b
y_hat2 = a * rooms + b
plt.scatter(rooms, prices)
plt.plot(rooms, y_hat1, color='red', linewidth=4)
plt.plot(rooms, y_hat2, color='green', linewidth=4)
$$ y = ax + b $$
函数的第一个参数是 $\begin{bmatrix} x \\ 1 \end{bmatrix}^T$,第二个参数就是 $y$。
# First argument of the lstsq function:
# the room counts stacked with a row of ones, then transposed
param1 = np.vstack([rooms, np.ones(rooms.size)]).T
array([[6.575, 1. ],
[6.421, 1. ],
[7.185, 1. ],
[6.976, 1. ],
[6.794, 1. ],
[6.03 , 1. ]])
# Second argument of the lstsq function
param2 = prices
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8,
7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1,
12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9,
27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4,
8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. ,
9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8,
10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5,
23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
# The rcond argument is simply set to None (not explained for now)
result = np.linalg.lstsq(param1, param2, rcond=None)
(array([ 9.10210898, -34.67062078]),
array([143.99484122, 2.46656609]))
# The first item of the result unpacks into the fitted a and b
a, b = result[0]
# NOTE(review): presumably result[1] is the residual sum of squares returned by
# lstsq, so dividing by the sample count gives the MSE — confirm.
mse = result[1][0] / rooms.size
print(a, b)
9.102108981180313 -34.67062077643857
# Compare the two fitted lines
plt.scatter(rooms, prices)
# Prices predicted with the a and b found by gradient descent
plt.plot(rooms, y_hat2, color='red', linewidth=4)
# Prices predicted with the a and b returned by the lstsq function
y_hat3 = a * rooms + b
plt.plot(rooms, y_hat3, color='green', linewidth=4)
