﻿## 14.5. Analiza przypadku: wielokrotna regresja liniowa na zbiorze „California Housing”
#### 14.5.1.1. Wczytanie zbioru danych
from sklearn.datasets import fetch_california_housing
# Load the California Housing dataset through scikit-learn's fetcher. The
# returned object is kept in `california`; the rest of the script reads its
# DESCR, data, target and feature_names attributes.
california = fetch_california_housing()

#### 14.5.1.2. Informacja o zbiorze
# Print the dataset's DESCR attribute (its description text).
print(california.DESCR)
# Inspect the dimensions of the feature array and of the target array, then
# the list of feature names. These are bare expressions, so their values are
# only displayed when run interactively (e.g. in IPython/Jupyter).
california.data.shape
california.target.shape
california.feature_names

### 14.5.2. Eksploracja danych
import pandas as pd
# Display settings for DataFrame output: 4 digits of precision, at most
# 9 columns shown, and automatic detection of the display width.
# The fully qualified 'display.*' names are required: on recent pandas
# releases the short forms 'precision' and 'max_columns' also match styler
# options, and set_option raises OptionError when a pattern matches more
# than one option.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 9)
pd.set_option('display.width', None)
# Wrap the feature array in a DataFrame (one column per feature name) and
# attach the target values as an additional 'MedHouseValue' column.
california_df = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
).assign(MedHouseValue=pd.Series(california.target))

# Preview the first rows and the per-column summary statistics
# (bare expressions: shown only in an interactive session).
california_df.head()
california_df.describe()

### 14.5.3. Wizualizacja cech
import matplotlib.pyplot as plt
import seaborn as sns

# Plot only a 10% random sample of the rows; the fixed seed keeps the
# sample reproducible between runs.
sample_df = california_df.sample(frac=0.1, random_state=17)

sns.set(font_scale=2)
sns.set_style('whitegrid')

# One 16x9 figure per feature: the feature on the x axis, 'MedHouseValue'
# on the y axis, with points colored by 'MedHouseValue' and no legend.
for column in california.feature_names:
    plt.figure(figsize=(16, 9))
    sns.scatterplot(
        data=sample_df,
        x=column,
        y='MedHouseValue',
        hue='MedHouseValue',
        palette='cool',
        legend=False,
    )

### 14.5.4. Podział próbek na treningowe i testowe
from sklearn.model_selection import train_test_split
# Split the features and targets into training and test subsets.
# random_state fixes the seed so the split is reproducible; no test_size is
# passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
      california.data, california.target, random_state=11)

# Inspect the dimensions of the two feature subsets (displayed only when
# run interactively).
X_train.shape
X_test.shape

### 14.5.5. Trenowanie modelu
from sklearn.linear_model import LinearRegression
# Create the estimator and fit it on the training subset.
linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=y_train)

# Print each feature name (right-aligned to 10 characters) next to the
# coefficient the fitted model assigned to it.
for name, coefficient in zip(california.feature_names,
                             linear_regression.coef_):
    print(f'{name:>10}: {coefficient}')

# Intercept term of the fitted model (displayed only interactively).
linear_regression.intercept_

### 14.5.6. Testowanie modelu
# Predict target values for the held-out test features.
prognoza = linear_regression.predict(X_test)
# Keep the true test targets under a parallel name for the comparisons below.
oczekiwane = y_test
# Peek at the first five predicted and expected values (displayed only when
# run interactively).
prognoza[:5]
oczekiwane[:5]

### 14.5.7. Wizualna konfrontacja prognozy z oczekiwaniami
# Two-column table pairing every expected value with its prediction.
df = pd.DataFrame({
    'Oczekiwane': pd.Series(oczekiwane),
    'Prognoza': pd.Series(prognoza),
})
figure = plt.figure(figsize=(9, 9))

# Scatter of expected (x) versus predicted (y), colored by the prediction.
axes = sns.scatterplot(
    data=df,
    x='Oczekiwane',
    y='Prognoza',
    hue='Prognoza',
    palette='cool',
    legend=False,
)

# Use the same range on both axes so the diagonal marks perfect predictions.
start = min(oczekiwane.min(), prognoza.min())
end = max(oczekiwane.max(), prognoza.max())
axes.set(xlim=(start, end), ylim=(start, end))

# Dashed black reference line from (start, start) to (end, end).
line = plt.plot([start, end], [start, end], 'k--')
 
### 14.5.8. Metryka w modelach regresyjnych
from sklearn import metrics
# Score the predictions against the expected values with two metrics: the
# R^2 score and the mean squared error. These are bare expressions, so the
# values are displayed only when run interactively.
metrics.r2_score(oczekiwane, prognoza)
metrics.mean_squared_error(oczekiwane, prognoza)
# --------------------------------
# https://scikit-learn.org/stable/modules/model_evaluation.html

### 14.5.9. Wybór najlepszego modelu
# https://scikit-learn.org/stable/modules/linear_model.html
# --------------------------------
from sklearn.linear_model import ElasticNet, Lasso, Ridge

# Candidate regressors keyed by the name printed in the report. The linear
# regression entry reuses the existing estimator object; the others are
# created with their default parameters.
estimators = {
    'LinearRegression': linear_regression,
    'ElasticNet': ElasticNet(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
}

from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled splitter with a fixed seed, so every estimator is scored
# on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=11)

# Cross-validate each candidate on the full dataset with R^2 scoring and
# print the mean score across the folds.
for estimator_name, estimator_object in estimators.items():
    scores = cross_val_score(
        estimator=estimator_object,
        X=california.data,
        y=california.target,
        cv=kfold,
        scoring='r2',
    )
    print(f'{estimator_name:>16}: ' +
          f'średnia punktacja r2 = {scores.mean():.3f}')
