包的导入以及图像上中文字体设置、图像清晰度设置
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['STfangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
array1 = np.arange(1, 10).reshape(3, 3)
array1
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
array2 = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
array2
array([[1, 1, 1],
[2, 2, 2],
[3, 3, 3]])
# 水平方向拼接
np.hstack((array1, array2))
array([[1, 2, 3, 1, 1, 1],
[4, 5, 6, 2, 2, 2],
[7, 8, 9, 3, 3, 3]])
# 垂直方向拼接
np.vstack((array1, array2))
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[1, 1, 1],
[2, 2, 2],
[3, 3, 3]])
# 沿着指定的轴拼接
np.concatenate((array1, array2))
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[1, 1, 1],
[2, 2, 2],
[3, 3, 3]])
np.concatenate((array1, array2), axis=1)
array([[1, 2, 3, 1, 1, 1],
[4, 5, 6, 2, 2, 2],
[7, 8, 9, 3, 3, 3]])
# 垂直方向拆分
np.vsplit(array2, 3)
[array([[1, 1, 1]]), array([[2, 2, 2]]), array([[3, 3, 3]])]
# 水平方向拆分
np.hsplit(array2, 3)
[array([[1],
[2],
[3]]),
array([[1],
[2],
[3]]),
array([[1],
[2],
[3]])]
# 在末尾追加元素
np.append(array1, 10)
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# 在指定位置插入元素
np.insert(array1, 0, 0)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array1[array1 % 3 == 0]
array([3, 6, 9])
# 根据条件筛选数据
np.extract(array1 % 3 == 0, array1)
array([3, 6, 9])
# 根据条件和公式获取数据
x = np.arange(10)
condlist = [x < 3, x > 5]
choicelist = [x, x ** 2]
np.select(condlist, choicelist, default=np.nan)
array([ 0., 1., 2., nan, nan, nan, 36., 49., 64., 81.])
# 根据条件和公式获取数据
np.where(x < 5, x, 10 * x)
array([ 0, 1, 2, 3, 4, 50, 60, 70, 80, 90])
def fib(n):
a, b = 0, 1
for _ in range(n):
a, b = b, a + b
yield a
gen = fib(20)
# 通过迭代器(生成器)创建数组对象
array3 = np.fromiter(gen, dtype=np.int64, count=10)
array3
array([ 1, 1, 2, 3, 5, 8, 13, 21, 34, 55], dtype=int64)
# 调整数组的大小
np.resize(array1, (4, 4))
array([[1, 2, 3, 4],
[5, 6, 7, 8],
[9, 1, 2, 3],
[4, 5, 6, 7]])
A ⋅ B = a 1 b 1 + a 2 b 2 = ∣ A ∣ ∣ B ∣ c o s θ A \cdot B = a_1b_1 + a_2b_2 = \lvert A \rvert \lvert B \rvert cos \thetaA⋅B=a1b1+a2b2=∣A∣∣B∣cosθ
A ⋅ B = ∑ i = 1 n a i b i = ∣ A ∣ ∣ B ∣ c o s θ A \cdot B = \sum_{i=1}^{n} a_ib_i = \lvert A \rvert \lvert B \rvert cos \thetaA⋅B=i=1∑naibi=∣A∣∣B∣cosθ
v1 = np.array([3, 5])
v2 = np.array([1, 3])
# inner_prod = np.dot(v1, v2)
inner_prod = np.inner(v1, v2)
print('向量点积:', inner_prod)
向量点积: 18
说明:在欧几里得几何中,两个笛卡尔坐标向量的点积也称为内积(inner product),但是内积的含义要高于点积,点积相当于是内积在欧几里得空间 $ \mathbb{R}^n $ 的特例,而内积可以推广到赋范向量空间。
v1_norm = np.linalg.norm(v1)
v2_norm = np.linalg.norm(v2)
print('v1的模:', np.round(v1_norm, 6))
print('v2的模:', np.round(v2_norm, 6))
v1的模: 5.830952
v2的模: 3.162278
cos_theta = inner_prod / (v1_norm * v2_norm)
print('向量夹角余弦值:', cos_theta)
print('夹角:', np.arccos(cos_theta) * 180 / np.pi)
向量夹角余弦值: 0.9761870601839526
夹角: 12.52880770915155
d e t ( A ) = ∑ n ! ± a 1 α a 2 β a 3 γ ⋯ a n ω det(A) = \sum_{n!} \pm a_{1\alpha}a_{2\beta}a_{3\gamma} \cdots a_{n\omega}det(A)=n!∑±a1αa2βa3γ⋯anω
array4 = np.stack((v1, v2))
array4
array([[3, 5],
[1, 3]])
d e t ∣ 3 5 1 3 ∣ = 4 det \begin{vmatrix} 3 & 5 \ 1 & 3 \end{vmatrix} = 4det∣∣∣∣3153∣∣∣∣=4
# 计算行列式的值
np.round(np.linalg.det(array4), 2)
4.0
d e t ∣ 1 2 3 4 5 6 7 8 9 ∣ = 0 det \begin{vmatrix} 1 & 2 & 3 \ 4 & 5 & 6 \ 7 & 8 & 9 \end{vmatrix} = 0det∣∣∣∣∣∣147258369∣∣∣∣∣∣=0
np.linalg.det(array1)
0.0
array1 = np.arange(1, 10).reshape((3, 3))
array1
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
对上面的array1
做一组线性变换,就知道为什么它的秩是2
了。
∣ 1 2 3 4 5 6 7 8 9 ∣ → ∣ 1 2 3 0 − 3 − 6 0 − 6 − 12 ∣ \begin{vmatrix} 1 & 2 & 3\ 4 & 5 & 6\ 7 & 8 & 9 \end{vmatrix} \quad \to \quad \begin{vmatrix} 1 & 2 & 3\ 0 & -3 & -6\ 0 & -6 & -12 \end{vmatrix}∣∣∣∣∣∣147258369∣∣∣∣∣∣→∣∣∣∣∣∣1002−3−63−6−12∣∣∣∣∣∣
# 求逆矩阵
# LinAlgError ---> Singluar matrix ---> 奇异矩阵不能求逆
# np.linalg.inv(array1)
array2 = np.array([[1, 2], [3, 4]])
array2
array([[1, 2],
[3, 4]])
array3 = np.linalg.inv(array2)
array3
array([[-2. , 1. ],
[ 1.5, -0.5]])
A ⋅ A − 1 = I A \cdot A^{-1} = IA⋅A−1=I
np.round(array2 @ array3, 2)
array([[1., 0.],
[0., 1.]])
# 求矩阵的秩
np.linalg.matrix_rank(array1)
2
array1[2, 2] = 8
array1
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 8]])
np.linalg.matrix_rank(array1)
3
解线性方程:
{ 3 x + y = 9 x + 2 y = 8 \begin{cases} 3x + y = 9 \ x + 2y = 8 \end{cases}{3x+y=9x+2y=8
A = np.array([[3, 1], [1, 2]])
b = np.array([9, 8]).reshape(-1, 1)
np.linalg.solve(A, b)
array([[2.],
[3.]])
A x = b A − 1 A x = A − 1 b I x = A − 1 b Ax = b\ A^{-1}Ax = A^{-1}b\ Ix = A^{-1}bAx=bA−1Ax=A−1bIx=A−1b
A_1 = np.linalg.inv(A)
A_1
array([[ 0.4, -0.2],
[-0.2, 0.6]])
A_1 @ b
array([[2.],
[3.]])
!pip install scikit-learn
Looking in indexes: https://pypi.doubanio.com/simple
Requirement already satisfied: scikit-learn in d:\programs\python\python38\lib\site-packages (0.24.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (2.2.0)
Requirement already satisfied: numpy>=1.13.3 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.21.2)
Requirement already satisfied: joblib>=0.11 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.0.1)
Requirement already satisfied: scipy>=0.19.1 in d:\programs\python\python38\lib\site-packages (from scikit-learn) (1.7.1)
from sklearn.datasets import load_boston
# 获取波士顿房价数据
dataset = load_boston()
print(dataset.DESCR)
.. _boston_dataset:
Boston house prices dataset
---------------------------
**Data Set Characteristics:**
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.
:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of black people by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
:Missing Attribute Values: None
:Creator: Harrison, D. and Rubinfeld, D.L.
This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/
This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.
The Boston house-price data has been used in many machine learning papers that address regression
problems.
.. topic:: References
- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
dataset.data.shape
(506, 13)
dataset.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
# 用波士顿房价数据创建DataFrame对象
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1.0 | 273.0 | 21.0 | 391.99 | 9.67 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 21.0 | 396.90 | 9.08 |
503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1.0 | 273.0 | 21.0 | 396.90 | 5.64 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1.0 | 273.0 | 21.0 | 393.45 | 6.48 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1.0 | 273.0 | 21.0 | 396.90 | 7.88 |
506 rows × 13 columns
# 添加房价列
df['PRICE'] = dataset.target
df
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | PRICE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1.0 | 273.0 | 21.0 | 391.99 | 9.67 | 22.4 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 21.0 | 396.90 | 9.08 | 20.6 |
503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1.0 | 273.0 | 21.0 | 396.90 | 5.64 | 23.9 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1.0 | 273.0 | 21.0 | 393.45 | 6.48 | 22.0 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1.0 | 273.0 | 21.0 | 396.90 | 7.88 | 11.9 |
506 rows × 14 columns
# 计算协方差
df.cov()
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | PRICE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CRIM | 73.986578 | -40.215956 | 23.992339 | -0.122109 | 0.419594 | -1.325038 | 85.405322 | -6.876722 | 46.847761 | 844.821538 | 5.399331 | -302.381816 | 27.986168 | -30.718508 |
ZN | -40.215956 | 543.936814 | -85.412648 | -0.252925 | -1.396148 | 5.112513 | -373.901548 | 32.629304 | -63.348695 | -1236.453735 | -19.776571 | 373.721402 | -68.783037 | 77.315176 |
INDUS | 23.992339 | -85.412648 | 47.064442 | 0.109669 | 0.607074 | -1.887957 | 124.513903 | -10.228097 | 35.549971 | 833.360290 | 5.692104 | -223.579756 | 29.580270 | -30.520823 |
CHAS | -0.122109 | -0.252925 | 0.109669 | 0.064513 | 0.002684 | 0.016285 | 0.618571 | -0.053043 | -0.016296 | -1.523367 | -0.066819 | 1.131325 | -0.097816 | 0.409409 |
NOX | 0.419594 | -1.396148 | 0.607074 | 0.002684 | 0.013428 | -0.024603 | 2.385927 | -0.187696 | 0.616929 | 13.046286 | 0.047397 | -4.020570 | 0.488946 | -0.455412 |
RM | -1.325038 | 5.112513 | -1.887957 | 0.016285 | -0.024603 | 0.493671 | -4.751929 | 0.303663 | -1.283815 | -34.583448 | -0.540763 | 8.215006 | -3.079741 | 4.493446 |
AGE | 85.405322 | -373.901548 | 124.513903 | 0.618571 | 2.385927 | -4.751929 | 792.358399 | -44.329379 | 111.770846 | 2402.690122 | 15.936921 | -702.940328 | 121.077725 | -97.589017 |
DIS | -6.876722 | 32.629304 | -10.228097 | -0.053043 | -0.187696 | 0.303663 | -44.329379 | 4.434015 | -9.068252 | -189.664592 | -1.059775 | 56.040356 | -7.473329 | 4.840229 |
RAD | 46.847761 | -63.348695 | 35.549971 | -0.016296 | 0.616929 | -1.283815 | 111.770846 | -9.068252 | 75.816366 | 1335.756577 | 8.760716 | -353.276219 | 30.385442 | -30.561228 |
TAX | 844.821538 | -1236.453735 | 833.360290 | -1.523367 | 13.046286 | -34.583448 | 2402.690122 | -189.664592 | 1335.756577 | 28404.759488 | 168.153141 | -6797.911215 | 654.714520 | -726.255716 |
PTRATIO | 5.399331 | -19.776571 | 5.692104 | -0.066819 | 0.047397 | -0.540763 | 15.936921 | -1.059775 | 8.760716 | 168.153141 | 4.686989 | -35.059527 | 5.782729 | -10.110657 |
B | -302.381816 | 373.721402 | -223.579756 | 1.131325 | -4.020570 | 8.215006 | -702.940328 | 56.040356 | -353.276219 | -6797.911215 | -35.059527 | 8334.752263 | -238.667516 | 279.989834 |
LSTAT | 27.986168 | -68.783037 | 29.580270 | -0.097816 | 0.488946 | -3.079741 | 121.077725 | -7.473329 | 30.385442 | 654.714520 | 5.782729 | -238.667516 | 50.994760 | -48.447538 |
PRICE | -30.718508 | 77.315176 | -30.520823 | 0.409409 | -0.455412 | 4.493446 | -97.589017 | 4.840229 | -30.561228 | -726.255716 | -10.110657 | 279.989834 | -48.447538 | 84.586724 |
# 计算皮尔逊相关系数
np.round(df.corr(), 2)
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | PRICE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
CRIM | 1.00 | -0.20 | 0.41 | -0.06 | 0.42 | -0.22 | 0.35 | -0.38 | 0.63 | 0.58 | 0.29 | -0.39 | 0.46 | -0.39 |
ZN | -0.20 | 1.00 | -0.53 | -0.04 | -0.52 | 0.31 | -0.57 | 0.66 | -0.31 | -0.31 | -0.39 | 0.18 | -0.41 | 0.36 |
INDUS | 0.41 | -0.53 | 1.00 | 0.06 | 0.76 | -0.39 | 0.64 | -0.71 | 0.60 | 0.72 | 0.38 | -0.36 | 0.60 | -0.48 |
CHAS | -0.06 | -0.04 | 0.06 | 1.00 | 0.09 | 0.09 | 0.09 | -0.10 | -0.01 | -0.04 | -0.12 | 0.05 | -0.05 | 0.18 |
NOX | 0.42 | -0.52 | 0.76 | 0.09 | 1.00 | -0.30 | 0.73 | -0.77 | 0.61 | 0.67 | 0.19 | -0.38 | 0.59 | -0.43 |
RM | -0.22 | 0.31 | -0.39 | 0.09 | -0.30 | 1.00 | -0.24 | 0.21 | -0.21 | -0.29 | -0.36 | 0.13 | -0.61 | 0.70 |
AGE | 0.35 | -0.57 | 0.64 | 0.09 | 0.73 | -0.24 | 1.00 | -0.75 | 0.46 | 0.51 | 0.26 | -0.27 | 0.60 | -0.38 |
DIS | -0.38 | 0.66 | -0.71 | -0.10 | -0.77 | 0.21 | -0.75 | 1.00 | -0.49 | -0.53 | -0.23 | 0.29 | -0.50 | 0.25 |
RAD | 0.63 | -0.31 | 0.60 | -0.01 | 0.61 | -0.21 | 0.46 | -0.49 | 1.00 | 0.91 | 0.46 | -0.44 | 0.49 | -0.38 |
TAX | 0.58 | -0.31 | 0.72 | -0.04 | 0.67 | -0.29 | 0.51 | -0.53 | 0.91 | 1.00 | 0.46 | -0.44 | 0.54 | -0.47 |
PTRATIO | 0.29 | -0.39 | 0.38 | -0.12 | 0.19 | -0.36 | 0.26 | -0.23 | 0.46 | 0.46 | 1.00 | -0.18 | 0.37 | -0.51 |
B | -0.39 | 0.18 | -0.36 | 0.05 | -0.38 | 0.13 | -0.27 | 0.29 | -0.44 | -0.44 | -0.18 | 1.00 | -0.37 | 0.33 |
LSTAT | 0.46 | -0.41 | 0.60 | -0.05 | 0.59 | -0.61 | 0.60 | -0.50 | 0.49 | 0.54 | 0.37 | -0.37 | 1.00 | -0.74 |
PRICE | -0.39 | 0.36 | -0.48 | 0.18 | -0.43 | 0.70 | -0.38 | 0.25 | -0.38 | -0.47 | -0.51 | 0.33 | -0.74 | 1.00 |
rooms = df['RM'].values
prices = df['PRICE'].values
history_data = {room: price for room, price in zip(rooms, prices)}
history_data
{6.575: 24.0,
6.421: 21.6,
7.185: 34.9,
6.998: 33.4,
7.147: 36.2,
6.43: 28.7,
6.012: 22.9,
6.172: 27.1,
5.631: 16.5,
6.004: 20.3,
6.377: 15.0,
6.009: 21.7,
5.889: 21.7,
5.949: 20.4,
6.096: 13.5,
5.834: 19.9,
5.935: 8.4,
5.99: 17.5,
5.456: 20.2,
5.727: 18.2,
5.57: 13.6,
5.965: 19.6,
6.142: 15.2,
5.813: 16.6,
5.924: 15.6,
5.599: 13.9,
6.047: 14.8,
6.495: 26.4,
6.674: 21.0,
5.713: 20.1,
6.072: 14.5,
5.95: 13.2,
5.701: 13.1,
5.933: 18.9,
5.841: 20.0,
5.85: 21.0,
5.966: 16.0,
6.595: 30.8,
7.024: 34.9,
6.77: 26.6,
6.169: 25.3,
6.211: 25.0,
6.069: 21.2,
5.682: 19.3,
5.786: 20.0,
6.03: 11.9,
5.399: 14.4,
5.602: 19.4,
5.963: 19.7,
6.115: 20.5,
6.511: 25.0,
5.998: 23.4,
5.888: 23.3,
7.249: 35.4,
6.383: 24.7,
6.816: 31.6,
6.145: 23.3,
5.927: 19.6,
5.741: 18.7,
6.456: 22.2,
6.762: 25.0,
7.104: 33.0,
6.29: 23.5,
5.787: 19.4,
5.878: 22.0,
5.594: 17.4,
5.885: 20.9,
6.417: 13.0,
5.961: 20.5,
6.065: 22.8,
6.245: 23.4,
6.273: 24.1,
6.286: 21.4,
6.279: 20.0,
6.14: 20.8,
6.232: 21.2,
5.874: 20.3,
6.727: 27.5,
6.619: 23.9,
6.302: 24.8,
6.167: 19.9,
6.389: 23.9,
6.63: 27.9,
6.015: 22.5,
6.121: 22.2,
7.007: 23.6,
7.079: 28.7,
6.405: 12.5,
6.442: 22.9,
6.249: 20.6,
6.625: 28.4,
6.163: 21.4,
8.069: 38.7,
7.82: 45.4,
7.416: 33.2,
6.781: 26.5,
6.137: 19.3,
5.851: 19.5,
5.836: 19.5,
6.127: 22.7,
6.474: 19.8,
6.229: 21.4,
6.195: 21.7,
6.715: 22.8,
5.913: 18.8,
6.092: 18.7,
6.254: 18.5,
5.928: 18.3,
6.176: 21.2,
6.021: 19.2,
5.872: 20.4,
5.731: 19.3,
5.87: 22.0,
5.856: 21.1,
5.879: 18.8,
5.986: 21.4,
5.613: 15.7,
5.693: 16.2,
6.431: 24.6,
5.637: 14.3,
6.458: 19.2,
6.326: 24.4,
6.372: 23.0,
5.822: 18.4,
5.757: 15.0,
6.335: 18.1,
5.942: 17.4,
6.454: 17.1,
5.857: 13.3,
6.151: 17.8,
6.174: 14.0,
5.019: 14.4,
5.403: 13.4,
5.468: 15.6,
4.903: 11.8,
6.13: 13.8,
5.628: 15.6,
4.926: 14.6,
5.186: 17.8,
5.597: 15.4,
6.122: 22.1,
5.404: 19.3,
5.012: 15.3,
5.709: 19.4,
6.129: 17.0,
6.152: 8.7,
5.272: 13.1,
6.943: 41.3,
6.066: 24.3,
6.51: 23.3,
6.25: 27.0,
7.489: 50.0,
7.802: 50.0,
8.375: 50.0,
5.854: 10.8,
6.101: 25.0,
7.929: 50.0,
5.877: 23.8,
6.319: 23.8,
6.402: 22.3,
5.875: 50.0,
5.88: 19.1,
5.572: 23.1,
6.416: 23.6,
5.859: 22.6,
6.546: 29.4,
6.02: 23.2,
6.315: 22.3,
6.86: 29.9,
6.98: 29.8,
7.765: 39.8,
6.144: 19.8,
7.155: 37.9,
6.563: 32.5,
5.604: 26.4,
6.153: 29.6,
7.831: 50.0,
6.782: 7.5,
6.556: 29.8,
6.951: 26.7,
6.739: 30.5,
7.178: 36.4,
6.8: 31.1,
6.604: 29.1,
7.875: 50.0,
7.287: 33.3,
7.107: 30.3,
7.274: 34.6,
6.975: 34.9,
7.135: 32.9,
6.162: 13.3,
7.61: 42.3,
7.853: 48.5,
8.034: 50.0,
5.891: 22.6,
5.783: 22.5,
6.064: 24.4,
5.344: 20.0,
5.96: 21.7,
5.807: 22.4,
6.375: 28.1,
5.412: 23.7,
6.182: 25.0,
6.642: 28.7,
5.951: 21.5,
6.373: 23.0,
6.164: 21.7,
6.879: 27.5,
6.618: 30.1,
8.266: 44.8,
8.725: 50.0,
8.04: 37.6,
7.163: 31.6,
7.686: 46.7,
6.552: 31.5,
5.981: 24.3,
7.412: 31.7,
8.337: 41.7,
8.247: 48.3,
6.726: 29.0,
6.086: 24.0,
6.631: 25.1,
7.358: 31.5,
6.481: 23.7,
6.606: 23.3,
6.897: 22.0,
6.095: 20.1,
6.358: 22.2,
6.393: 23.7,
5.593: 17.6,
5.605: 18.5,
6.108: 21.9,
6.226: 20.5,
6.433: 24.5,
6.718: 26.2,
6.487: 24.4,
6.438: 24.8,
6.957: 29.6,
8.259: 42.8,
5.876: 20.9,
7.454: 44.0,
8.704: 50.0,
7.333: 36.0,
6.842: 30.1,
7.203: 33.8,
7.52: 43.1,
8.398: 48.8,
7.327: 31.0,
7.206: 36.5,
5.56: 22.8,
7.014: 30.7,
8.297: 50.0,
7.47: 43.5,
5.92: 20.7,
6.24: 25.2,
6.538: 24.4,
7.691: 35.2,
6.758: 32.4,
6.854: 32.0,
7.267: 33.2,
6.826: 33.1,
6.482: 29.1,
6.812: 35.1,
6.968: 10.4,
7.645: 46.0,
7.923: 50.0,
7.088: 32.2,
6.453: 22.0,
6.23: 20.1,
6.209: 21.4,
6.565: 24.8,
6.861: 28.5,
7.148: 37.3,
6.678: 28.6,
6.549: 27.1,
5.79: 20.3,
6.345: 22.5,
7.041: 29.0,
6.871: 24.8,
6.59: 22.0,
6.982: 33.1,
7.236: 36.1,
6.616: 28.4,
7.42: 33.4,
6.849: 28.2,
6.635: 24.5,
5.972: 20.3,
4.973: 16.1,
6.023: 19.4,
6.266: 21.6,
6.567: 23.8,
5.705: 16.2,
5.914: 17.8,
5.782: 19.8,
6.382: 23.1,
6.113: 21.0,
6.426: 23.8,
6.376: 17.7,
6.041: 20.4,
5.708: 18.5,
6.415: 25.0,
6.312: 21.2,
6.083: 22.2,
5.868: 19.3,
6.333: 22.6,
5.706: 17.1,
6.031: 19.4,
6.316: 22.2,
6.31: 20.7,
6.037: 21.1,
5.869: 19.5,
5.895: 18.5,
6.059: 20.6,
5.985: 19.0,
5.968: 18.7,
7.241: 32.7,
6.54: 16.5,
6.696: 23.9,
6.874: 31.2,
6.014: 17.5,
5.898: 17.2,
6.516: 23.1,
6.939: 26.6,
6.49: 22.9,
6.579: 24.1,
5.884: 18.6,
6.728: 14.9,
5.663: 18.2,
5.936: 13.5,
6.212: 17.8,
6.395: 21.7,
6.112: 22.6,
6.398: 25.0,
6.251: 12.6,
5.362: 20.8,
5.803: 16.8,
8.78: 21.9,
3.561: 27.5,
4.963: 21.9,
3.863: 23.1,
4.97: 50.0,
6.683: 50.0,
7.016: 50.0,
6.216: 50.0,
4.906: 13.8,
4.138: 11.9,
7.313: 15.0,
6.649: 13.9,
6.794: 22.0,
6.38: 9.5,
6.223: 10.2,
6.545: 10.9,
5.536: 11.3,
5.52: 12.3,
4.368: 8.8,
5.277: 7.2,
4.652: 10.5,
5.0: 7.4,
4.88: 10.2,
5.39: 19.7,
6.051: 23.2,
5.036: 9.7,
6.193: 11.0,
5.887: 12.7,
6.471: 13.1,
5.747: 8.5,
5.453: 5.0,
5.852: 6.3,
5.987: 5.6,
6.343: 7.2,
6.404: 12.1,
5.349: 8.3,
5.531: 8.5,
5.683: 5.0,
5.608: 27.9,
5.617: 17.2,
6.852: 27.5,
6.657: 17.2,
4.628: 17.9,
5.155: 16.3,
4.519: 7.0,
6.434: 7.2,
5.304: 12.0,
5.957: 8.8,
6.824: 8.4,
6.411: 16.7,
6.006: 14.2,
5.648: 20.8,
6.103: 13.4,
5.565: 11.7,
5.896: 8.3,
5.837: 10.2,
6.202: 10.9,
6.348: 14.5,
6.833: 14.1,
6.425: 16.1,
6.436: 14.3,
6.208: 11.7,
6.629: 13.4,
6.461: 9.6,
5.627: 12.8,
5.818: 10.5,
6.406: 17.1,
6.219: 18.4,
6.485: 15.4,
6.459: 11.8,
6.341: 14.9,
6.185: 14.6,
6.749: 13.4,
6.655: 15.2,
6.297: 16.1,
7.393: 17.8,
6.525: 14.1,
5.976: 12.7,
6.301: 14.9,
6.081: 20.0,
6.701: 16.4,
6.317: 19.5,
6.513: 20.2,
5.759: 19.9,
5.952: 19.0,
6.003: 19.1,
5.926: 24.5,
6.437: 23.2,
5.427: 13.8,
6.484: 16.7,
6.242: 23.0,
6.75: 23.7,
7.061: 25.0,
5.762: 21.8,
5.871: 20.6,
6.114: 19.1,
5.905: 20.6,
5.454: 15.2,
5.414: 7.0,
5.093: 8.1,
5.983: 20.1,
5.707: 21.8,
5.67: 23.1,
5.794: 18.3,
6.019: 21.2,
5.569: 17.5,
6.027: 16.8,
6.593: 22.4,
6.12: 20.6,
6.976: 23.9}
通过计算皮尔逊相关系数,发现房间数和房价存在正相关,接下来我们通过学习历史数据,最终实现用房间数预测房价的目标。
import heapq
nums = [35, 98, 76, 12, 55, 68, 47, 92]
print(heapq.nlargest(3, nums))
print(heapq.nsmallest(3, nums))
[98, 92, 76]
[12, 35, 47]
import heapq
# kNN算法
def predict_price_by_knn(history_data, room, k=5):
# keys = sorted(history_data, key=lambda x: (x - room) ** 2)[:k]
keys = heapq.nsmallest(k, history_data, key=lambda x: (x - room) ** 2)
return np.mean([history_data[key] for key in keys])
# 预测房价
np.round(predict_price_by_knn(history_data, 6.25), 2)
20.42
np.round(predict_price_by_knn(history_data, 5.125), 2)
13.26
# 通过散点图研究变量的关系
plt.scatter(rooms, prices)
plt.show()
通过上面的图,我们发现房间数和房价呈现出线性关系,接下来我们尝试用一个线性函数来实现对房价的预测。
回归方程:x xx 代表房间数,y yy 就是要预测的房价。
y = a x + b y = ax + by=ax+b
现在我们的问题是找到一组a
和b
,让预测达到最佳的效果(误差最小就是最佳)。
均方误差:让均方误差最小的 a aa 和 b bb 就是最佳拟合。
M S E = 1 N ∑ ( y i ^ − y i ) 2 MSE = \frac{1} {N} \sum (\hat{y_i} - y_i)^2MSE=N1∑(yi^−yi)2
def get_loss(x, y, a, b):
"""损失函数"""
y_hat = a * x + b
return np.mean((y_hat - y) ** 2)
# 通过蒙特卡罗模拟找到实现最佳拟合的a和b的值
import random
best_a, best_b = None, None
min_loss = np.inf
for _ in range(1000):
# 随机产生a和b的值
a = random.random() * 200 - 100
b = random.random() * 200 - 100
# 计算损失(MSE)
curr_loss = get_loss(rooms, prices, a, b)
# 让损失更小的a和b就是更好的拟合
if curr_loss < min_loss:
min_loss = curr_loss
best_a, best_b = a, b
print(best_a, best_b)
print(min_loss)
12.414266461017732 -56.48722240398021
50.00741553150247
损失函数是凹函数,找到使函数最小的a
和b
的值,可以用下面的方法:
a ′ = a + ( − 1 ) × ∂ l o s s ( a , b ) ∂ a × Δ a^\prime = a + (-1) \times \frac {\partial loss(a, b)} {\partial a} \times \Deltaa′=a+(−1)×∂a∂loss(a,b)×Δ
b ′ = b + ( − 1 ) × ∂ l o s s ( a , b ) ∂ b × Δ b^\prime = b + (-1) \times \frac {\partial loss(a, b)} {\partial b} \times \Deltab′=b+(−1)×∂b∂loss(a,b)×Δ
对于求MSE的损失函数来说,可以用下面的公式计算偏导数:
f ( a , b ) = 1 N ∑ i = 1 N ( y i − ( a x i + b ) ) 2 f(a, b) = \frac {1} {N} \sum_{i=1}^{N}(y_i - (ax_i + b))^2f(a,b)=N1i=1∑N(yi−(axi+b))2
∂ f ( a , b ) ∂ a = 2 N ∑ i = 1 N ( − x i y i + x i 2 a + x i b ) \frac {\partial {f(a, b)}} {\partial {a}} = \frac {2} {N} \sum_{i=1}^{N}(-x_iy_i + x_i^2a + x_ib)∂a∂f(a,b)=N2i=1∑N(−xiyi+xi2a+xib)
∂ f ( a , b ) ∂ b = 2 N ∑ i = 1 N ( − y i + x i a + b ) \frac {\partial {f(a, b)}} {\partial {b}} = \frac {2} {N} \sum_{i=1}^{N}(-y_i + x_ia + b)∂b∂f(a,b)=N2i=1∑N(−yi+xia+b)
# 求a的偏导数
def partial_a(x, y, a, b):
return 2 * np.mean((y - a * x - b) * (-x))
# 求b的偏导数
def partial_b(x, y, a, b):
return 2 * np.mean(-y + a * x + b)
# 通过梯度下降的方式向拐点逼近
# 这种方式能够更快的找到最佳拟合的a和b
# a和b的初始值可以随意设定,delta的值要足够小
a, b = 35, -35
delta = 0.01
for _ in range(100):
a = a - partial_a(rooms, prices, a, b) * delta
b = b - partial_b(rooms, prices, a, b) * delta
print(a, b)
print(get_loss(rooms, prices, a, b))
9.276809660789766 -35.781905844032686
43.61576735104159
# 通过线性回归方程预测房价
def predict_price_by_regression(a, b, x):
return a * x + b
# 预测房价
print(np.round(predict_price_by_regression(best_a, best_b, 6.25), 2))
print(np.round(predict_price_by_regression(a, b, 6.25), 2))
21.1
22.2
print(np.round(predict_price_by_regression(best_a, best_b, 5.12), 2))
print(np.round(predict_price_by_regression(a, b, 5.12), 2))
7.07
11.72
# 比较两条拟合曲线
y_hat1 = best_a * rooms + best_b
y_hat2 = a * rooms + b
plt.scatter(rooms, prices)
plt.plot(rooms, y_hat1, color='red', linewidth=4)
plt.plot(rooms, y_hat2, color='green', linewidth=4)
plt.show()
最小二乘解就是用已经得到的历史数据(x
和y
的值)找到能够最佳拟合这些历史数据的a
和b
。
y = a x + b y = ax + by=ax+b
对于上面的方程,相当于x
是变量a
的系数,1
是变量b
的系数。
lstsq
函数参数说明lstsq
函数的第一个参数是$ \begin{bmatrix} x \ 1 \ \end{bmatrix} ^T $,第二个参数就是y
,rcond
参数暂时不管,直接设置为None
。
# lstsq函数的第一个参数
param1 = np.vstack([rooms, np.ones(rooms.size)]).T
param1
array([[6.575, 1. ],
[6.421, 1. ],
[7.185, 1. ],
...,
[6.976, 1. ],
[6.794, 1. ],
[6.03 , 1. ]])
# lstsq函数的第二个参数
param2 = prices
param2
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8,
7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1,
12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9,
27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4,
8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. ,
9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8,
10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5,
23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
lstsq
函数返回值说明lstsq
函数返回的是一个四元组,四元组中的第一个元素就是要求解的方程的系数,四元组中的第二个元素是误差平方和。
# rcond参数直接设置为None(暂不解释)
result = np.linalg.lstsq(param1, param2, rcond=None)
result
(array([ 9.10210898, -34.67062078]),
array([22061.87919621]),
2,
array([143.99484122, 2.46656609]))
a, b = result[0]
mse = result[1][0] / rooms.size
print(a, b)
print(mse)
9.102108981180313 -34.67062077643857
43.600551771169584
# 比较两条拟合曲线
plt.scatter(rooms, prices)
# 梯度下降法给出的a和b预测出的房价
plt.plot(rooms, y_hat2, color='red', linewidth=4)
# lstsq函数给出的a和b预测出的房价
y_hat3 = a * rooms + b
plt.plot(rooms, y_hat3, color='green', linewidth=4)
plt.show()
版权说明 : 本文为转载文章, 版权归原作者所有 版权申明
原文链接 : https://blog.csdn.net/Lemon_Review/article/details/120316327
内容来源于网络,如有侵权,请联系作者删除!