Files
artificial_intelligence_sys…/андрюха1.ipynb
2025-04-04 13:28:56 +03:00

889 lines
37 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа 1\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\Cyber\\\\Downloads\\\\daily-weather-dataset_chronological-order.xlsx'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_excel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mC:\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mUsers\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mCyber\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mDownloads\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mdaily-weather-dataset_chronological-order.xlsx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msheet_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdaily\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/excel/_base.py:495\u001b[0m, in \u001b[0;36mread_excel\u001b[0;34m(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend, engine_kwargs)\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(io, ExcelFile):\n\u001b[1;32m 494\u001b[0m should_close \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 495\u001b[0m io \u001b[38;5;241m=\u001b[39m \u001b[43mExcelFile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 501\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m engine \u001b[38;5;129;01mand\u001b[39;00m engine \u001b[38;5;241m!=\u001b[39m io\u001b[38;5;241m.\u001b[39mengine:\n\u001b[1;32m 502\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 503\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEngine should not be specified when passing \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 
504\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man ExcelFile - ExcelFile already has the engine set\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 505\u001b[0m )\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/excel/_base.py:1550\u001b[0m, in \u001b[0;36mExcelFile.__init__\u001b[0;34m(self, path_or_buffer, engine, storage_options, engine_kwargs)\u001b[0m\n\u001b[1;32m 1548\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxls\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1549\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1550\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[43minspect_excel_format\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1551\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 1552\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1553\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ext \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1554\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1555\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExcel file format cannot be determined, you must specify \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1556\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man engine manually.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1557\u001b[0m )\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/excel/_base.py:1402\u001b[0m, in \u001b[0;36minspect_excel_format\u001b[0;34m(content_or_path, storage_options)\u001b[0m\n\u001b[1;32m 1399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(content_or_path, \u001b[38;5;28mbytes\u001b[39m):\n\u001b[1;32m 1400\u001b[0m content_or_path \u001b[38;5;241m=\u001b[39m BytesIO(content_or_path)\n\u001b[0;32m-> 1402\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1403\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 1404\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m handle:\n\u001b[1;32m 1405\u001b[0m stream \u001b[38;5;241m=\u001b[39m handle\u001b[38;5;241m.\u001b[39mhandle\n\u001b[1;32m 1406\u001b[0m stream\u001b[38;5;241m.\u001b[39mseek(\u001b[38;5;241m0\u001b[39m)\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m--> 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 883\u001b[0m handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:\\\\Users\\\\Cyber\\\\Downloads\\\\daily-weather-dataset_chronological-order.xlsx'"
]
}
],
"source": [
"data = pd.read_excel(\"daily-weather-dataset_chronological-order.xlsx\", sheet_name=\"daily\")  # relative path: keep the dataset next to the notebook; the old absolute Windows path (C:\\Users\\Cyber\\...) failed because this kernel runs on Linux"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.shape # Параметр .shape показывает размерность датафрейма"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.size # Параметр .size показывает количество элементов в датафрейме"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.count() # Метод count считает сколько всего непустых записей в каждом столбце"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.info() # Метод .info() показывает тип каждого столбца и занимаемую память"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.dtypes # Параметр .dtypes показывает просто тип каждого столбца"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.isna().head() # Метод .isna() вместо каждого значения подставит True (значение NaN) или False (действительное значение)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.isna().sum() # Подсчитаем количество пропусков в каждом столбце с помощью метода .sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"proc = data.isna().sum().sum() # Подсчитаем сколько всего пропусков (во всех столбцах) в нашем датафрейме\n",
"print(proc) # Отобразим количество посчитанных пропусков"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"proc = data.isna().sum().sum() / data.size\n",
"print(round(100*proc,1), '%', sep='')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data)\n",
"# Функция для удаления выбросов по IQR\n",
"def remove_outliers(df, column):\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n",
"\n",
"# Удаляем выбросы из указанных столбцов\n",
"columns_to_clean = [\"Cloud coverage\"]\n",
"for col in columns_to_clean:\n",
" df = remove_outliers(df, col)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(axis=1, how='all')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#3 Для прогнозирования солнечной генерации применена группировка по месяцу и времени года чтобы учитывать сезонность выработки.\n",
"df[\"Date\"] = pd.to_datetime(df[\"Date\"]) # Преобразуем столбец 'Date' в формат datetime\n",
"df[\"Month\"] = df[\"Date\"].dt.month # Добавляем столбец 'Month' для группировки по месяцам\n",
"df_monthly = df.groupby(\"Month\").mean(numeric_only=True) # group by month and average; numeric_only=True avoids a TypeError on the datetime 'Date' column in pandas >= 2.0\n",
"df_monthly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#5\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.1 \n",
"# Выбираем числовые столбцы, исключая \"Month\"\n",
"numeric_features = df.select_dtypes(include=[\"number\"]).columns\n",
"numeric_features = numeric_features.drop(\"Month\") # Убираем \"Month\"\n",
"\n",
"# Строим графики для всех числовых признаков (кроме \"Month\")\n",
"for col in numeric_features:\n",
" plt.figure(figsize=(14, 4))\n",
"\n",
" # График плотности (KDE)\n",
" plt.subplot(121)\n",
" sns.kdeplot(data=df, x=col)\n",
" plt.title(f\"Распределение: {col}\")\n",
"\n",
" # Boxplot (ящик с усами)\n",
" plt.subplot(122)\n",
" sns.boxplot(data=df, x=col)\n",
" plt.title(f\"Boxplot: {col}\")\n",
"\n",
" plt.show()\n",
"\n",
"#plt.figure(figsize=(8, 5))\n",
"#sns.histplot(df['Visibility'], bins=20, kde=True)\n",
"#plt.title(\"Гистограмма распределения параметра видимости\")\n",
"#plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.3. Матрица корреляции\n",
"# Позволяет увидеть, как связаны между собой числовые переменные и для поиска зависимостей между погодными параметрами\n",
"# Чем ближе значение к 1 или -1, тем сильнее положительная или отрицательная корреляция\n",
"plt.figure(figsize=(8, 5))\n",
"sns.heatmap(df_monthly.corr(), annot=True, cmap='coolwarm', fmt=\".2f\")\n",
"plt.title(\"Матрица корреляции погодных параметров\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.4. Диаграмма рассеяния облачности и солнечной энергии\n",
"# Показывает взаимосвязь между облачностью и уровнем солнечной энергии.\n",
"plt.figure(figsize=(8, 5))\n",
"sns.scatterplot(x=df['Cloud coverage'], y=df['Solar energy'])\n",
"plt.title(\"Диаграмма рассеяния: Облачность vs Солнечная энергия\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.5. Среднее значение облачности по месяцам\n",
"#Позволяет проанализировать сезонные тренды облачности.\n",
"#Позволяет быстро увидеть, в какие месяцы облачность выше или ниже.\n",
"plt.figure(figsize=(8, 5))\n",
"sns.barplot(x=df['Month'], y=df['Cloud coverage'], estimator='mean')  # 'mean' matches the title; estimator=sum plotted monthly totals, not averages\n",
"plt.title(\"Среднее значение облачности по месяцам\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_selection import VarianceThreshold\n",
"\n",
"# Оставляем только числовые признаки\n",
"numeric_features = df.select_dtypes(include=[\"number\"])\n",
"\n",
"# Удаляем признаки с дисперсией ниже 0.01\n",
"selector = VarianceThreshold(threshold=0.01)\n",
"df_var = selector.fit_transform(numeric_features)\n",
"\n",
"# Получаем оставшиеся названия признаков\n",
"selected_features = numeric_features.columns[selector.get_support()]\n",
"df_selected = df[selected_features]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_selection import SelectKBest, f_regression\n",
"\n",
"# Убираем столбец с датами и выбираем только числовые признаки\n",
"X = df.select_dtypes(include=[\"number\"]).drop(columns=[\"Solar energy\"]) \n",
"y = df[\"Solar energy\"] # Целевая переменная\n",
"\n",
"# Отбор 5 лучших признаков\n",
"selector = SelectKBest(score_func=f_regression, k=5)\n",
"X_new = selector.fit_transform(X, y)\n",
"\n",
"# Выводим выбранные признаки\n",
"selected_features = X.columns[selector.get_support()]\n",
"print(\"Выбранные признаки:\", selected_features)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_selection import RFE\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"model = LinearRegression()\n",
"rfe = RFE(model, n_features_to_select=5)\n",
"X_rfe = rfe.fit_transform(X, y)\n",
"\n",
"selected_features = X.columns[rfe.support_]\n",
"print(\"Новые лучшие признаки:\", selected_features)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"Temp_Cloud\"] = df[\"Temperature\"] * df[\"Cloud coverage\"]\n",
"print(\"\\nDataFrame с добавленным признаком 'Temp_Cloud':\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 7\n",
"plt.figure(figsize=(8, 5))\n",
"sns.histplot(df[\"Solar energy\"], bins=30, kde=True)\n",
"plt.title(\"Распределение целевой переменной (Solar Energy)\")\n",
"plt.xlabel(\"Solar Energy\")\n",
"plt.ylabel(\"Частота\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(12, 6))\n",
"sns.boxplot(x=df[\"Month\"], y=df[\"Solar energy\"])\n",
"plt.title(\"Распределение Solar Energy по месяцам\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#8\n",
"from sklearn.model_selection import train_test_split\n",
"df = df.drop(columns=[\"Date\"])\n",
"df = df.drop(columns=[\"Month\"])\n",
"y = df[\"Solar energy\"]\n",
"# Удалим целевую переменную\n",
"X = df.drop(columns=[\"Solar energy\"])\n",
"\n",
"\n",
"# Разбиение (80% train, 20% test)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"# Вывод размеров получившихся выборок\n",
"print(f\"Размер X_train: {X_train.shape}\")\n",
"print(f\"Размер X_test: {X_test.shape}\")\n",
"print(f\"Размер y_train: {y_train.shape}\")\n",
"print(f\"Размер y_test: {y_test.shape}\")\n",
"print(\"Обучающая выборка X:\")\n",
"print(X_train)\n",
"print(\"\\nТестовая выборка X:\")\n",
"print(X_test)\n",
"print(\"\\nОбучающая выборка y:\")\n",
"print(y_train)\n",
"print(\"\\nТестовая выборка y:\")\n",
"print(y_test)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа 2\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# Модели машинного обучения\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from catboost import CatBoostRegressor\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"from tensorflow.keras.callbacks import EarlyStopping\n",
"from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold\n",
"from sklearn.neural_network import MLPRegressor\n",
"cv = KFold(n_splits=5, shuffle=True, random_state=42)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Knn\n",
"knn_params = {\n",
" 'n_neighbors': [3, 5, 7, 9],\n",
" 'weights': ['uniform', 'distance'],\n",
" 'metric': ['euclidean', 'manhattan']\n",
"}\n",
"knn_grid = GridSearchCV(KNeighborsRegressor(), knn_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"knn_grid.fit(X_train, y_train)\n",
"print(\"Best KNN:\", knn_grid.best_params_, \"Best R²:\", knn_grid.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Дерево решений\n",
"dt_params = {\n",
" 'max_depth': [3, 5, 10, 15],\n",
" 'min_samples_split': [2, 5, 10],\n",
" 'min_samples_leaf': [1, 2, 5]\n",
"}\n",
"dt_grid = GridSearchCV(DecisionTreeRegressor(random_state=42), dt_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"dt_grid.fit(X_train, y_train)\n",
"print(\"Best Decision Tree:\", dt_grid.best_params_, \"Best R²:\", dt_grid.best_score_)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Случайный лес\n",
"rf_params = {\n",
" 'n_estimators': [100, 200, 300],\n",
" 'max_depth': [5, 10, 15],\n",
" 'min_samples_split': [2, 5, 10]\n",
"}\n",
"rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"rf_grid.fit(X_train, y_train)\n",
"print(\"Best Random Forest:\", rf_grid.best_params_, \"Best R²:\", rf_grid.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Градиентный бустинг\n",
"cat_params = {\n",
" 'iterations': [100, 500, 1000],\n",
" 'learning_rate': [0.01, 0.05, 0.1],\n",
" 'depth': [4, 6, 8]\n",
"}\n",
"cat_grid = RandomizedSearchCV(CatBoostRegressor(verbose=0, random_state=42), cat_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"cat_grid.fit(X_train, y_train)\n",
"print(\"Best CatBoost:\", cat_grid.best_params_, \"Best R²:\", cat_grid.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Нейронная сеть\n",
"mlp_params = {\n",
" 'hidden_layer_sizes': [(50,), (100,), (50, 50)],\n",
" 'activation': ['relu', 'tanh'],\n",
" 'alpha': [0.0001, 0.001, 0.01]\n",
"}\n",
"mlp_grid = RandomizedSearchCV(MLPRegressor(max_iter=5500, random_state=42), mlp_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"mlp_grid.fit(X_train, y_train)\n",
"print(\"Best MLP:\", mlp_grid.best_params_, \"Best R²:\", mlp_grid.best_score_)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Сравнение моделей\n",
"models = [\"KNN\", \"Decision Tree\", \"Random Forest\", \"CatBoost\", \"Neural Network\"]\n",
"scores = [\n",
" knn_grid.best_score_,\n",
" dt_grid.best_score_,\n",
" rf_grid.best_score_,\n",
" cat_grid.best_score_,\n",
" mlp_grid.best_score_\n",
"]\n",
"\n",
"plt.figure(figsize=(10, 5))\n",
"sns.barplot(x=models, y=scores)\n",
"plt.ylabel(\"R2 Score\")\n",
"plt.title(\"Сравнение моделей машинного обучения\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа №3 - Оценка моделей\n",
"knn_best = knn_grid.best_estimator_\n",
"dt_best = dt_grid.best_estimator_\n",
"rf_best = rf_grid.best_estimator_\n",
"cat_best = cat_grid.best_estimator_\n",
"mlp_best = mlp_grid.best_estimator_\n",
"def evaluate_model(model, X_test, y_test):\n",
" y_pred = model.predict(X_test)\n",
" mae = mean_absolute_error(y_test, y_pred)\n",
" mse = mean_squared_error(y_test, y_pred)\n",
" rmse = np.sqrt(mse)\n",
" mape = np.nanmean(np.abs((y_test - y_pred) / np.where(y_test == 0, np.nan, y_test))) * 100  # skip zero actuals to avoid division by zero\n",
" r2 = r2_score(y_test, y_pred)\n",
" return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}\n",
"\n",
"models = {'KNN': knn_best, 'Decision Tree': dt_best, 'Random Forest': rf_best, 'CatBoost': cat_best, 'MLP': mlp_best}\n",
"\n",
"for name, model in models.items():\n",
" results = evaluate_model(model, X_test, y_test)\n",
" print(f\"{name} Evaluation: {results}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Визуализация результатов\n",
"# Словарь для хранения результатов\n",
"metrics = {'Model': [], 'MAE': [], 'MSE': [], 'RMSE': [], 'MAPE': [], 'R2': []}\n",
"\n",
"# Оценка каждой модели\n",
"for name, model in models.items():\n",
" results = evaluate_model(model, X_test, y_test)\n",
" metrics['Model'].append(name)\n",
" for key in results:\n",
" metrics[key].append(results[key])\n",
"\n",
"# Преобразование в DataFrame\n",
"metrics_df = pd.DataFrame(metrics)\n",
"metrics_df.set_index('Model', inplace=True)\n",
"\n",
"# Названия метрик и их описания для графиков\n",
"metric_labels = {\n",
" 'MAE': 'Средняя абсолютная ошибка (MAE)',\n",
" 'MSE': 'Среднеквадратическая ошибка (MSE)',\n",
" 'RMSE': 'Корень из MSE (RMSE)',\n",
" 'MAPE': 'Средняя абсолютная процентная ошибка (MAPE)',\n",
" 'R2': 'Коэффициент детерминации (R²)'\n",
"}\n",
"\n",
"# Отображение каждого графика отдельно\n",
"for metric in metrics_df.columns:\n",
" plt.figure(figsize=(8, 5))\n",
" sns.barplot(\n",
" x=metrics_df.index, \n",
" y=metrics_df[metric], \n",
" hue=metrics_df.index, # Добавляем hue\n",
" palette='viridis', \n",
" edgecolor='black',\n",
" legend=False # Отключаем легенду, так как цвета соответствуют x\n",
" )\n",
" plt.title(f'Сравнение моделей по {metric_labels[metric]}', fontsize=14)\n",
" plt.xlabel(\"Модель\", fontsize=12)\n",
" plt.ylabel(metric_labels[metric], fontsize=12)\n",
" plt.xticks(rotation=45)\n",
" plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
" plt.show()\n",
"\n",
"\n",
"# Визуализация фактических vs предсказанных значений для лучшей модели (по R²)\n",
"best_model_name = metrics_df.sort_values(by='R2', ascending=False).index[0]\n",
"best_model = models[best_model_name]\n",
"\n",
"y_pred_best = best_model.predict(X_test)\n",
"\n",
"plt.figure(figsize=(8, 8))\n",
"sns.scatterplot(x=y_test, y=y_pred_best, alpha=0.6)\n",
"plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--r', label=\"Идеальное предсказание\")\n",
"plt.xlabel(\"Фактические значения\", fontsize=12)\n",
"plt.ylabel(\"Предсказанные значения\", fontsize=12)\n",
"plt.title(f\"Фактические vs. Предсказанные ({best_model_name})\", fontsize=14)\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа №4\n",
"df = df.drop(columns=[\"Altimeter\"])\n",
"df = df.drop(columns=[\"Temp_Cloud\"])\n",
"y = df[\"Solar energy\"]\n",
"X = df.drop(columns=[\"Solar energy\"])\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Knn\n",
"\n",
"knn_params = {\n",
"\n",
" 'n_neighbors': [3, 5, 7, 9],\n",
"\n",
" 'weights': ['uniform', 'distance'],\n",
"\n",
" 'metric': ['euclidean', 'manhattan']\n",
"\n",
"}\n",
"\n",
"knn_grid = GridSearchCV(KNeighborsRegressor(), knn_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"\n",
"knn_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best KNN:\", knn_grid.best_params_, \"Best R²:\", knn_grid.best_score_)\n",
"\n",
"# Дерево решений\n",
"\n",
"dt_params = {\n",
"\n",
" 'max_depth': [3, 5, 10, 15],\n",
"\n",
" 'min_samples_split': [2, 5, 10],\n",
"\n",
" 'min_samples_leaf': [1, 2, 5]\n",
"\n",
"}\n",
"\n",
"dt_grid = GridSearchCV(DecisionTreeRegressor(random_state=42), dt_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"\n",
"dt_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best Decision Tree:\", dt_grid.best_params_, \"Best R²:\", dt_grid.best_score_)\n",
"\n",
"# Случайный лес\n",
"\n",
"rf_params = {\n",
"\n",
" 'n_estimators': [100, 200, 300],\n",
"\n",
" 'max_depth': [5, 10, 15],\n",
"\n",
" 'min_samples_split': [2, 5, 10]\n",
"\n",
"}\n",
"\n",
"rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"\n",
"rf_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best Random Forest:\", rf_grid.best_params_, \"Best R²:\", rf_grid.best_score_)\n",
"\n",
"# Градиентный бустинг\n",
"\n",
"cat_params = {\n",
"\n",
" 'iterations': [100, 500, 1000],\n",
"\n",
" 'learning_rate': [0.01, 0.05, 0.1],\n",
"\n",
" 'depth': [4, 6, 8]\n",
"\n",
"}\n",
"\n",
"cat_grid = RandomizedSearchCV(CatBoostRegressor(verbose=0, random_state=42), cat_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"\n",
"cat_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best CatBoost:\", cat_grid.best_params_, \"Best R²:\", cat_grid.best_score_)\n",
"\n",
"# Нейронная сеть\n",
"\n",
"mlp_params = {\n",
"\n",
" 'hidden_layer_sizes': [(50,), (100,), (50, 50)],\n",
"\n",
" 'activation': ['relu', 'tanh'],\n",
"\n",
" 'alpha': [0.0001, 0.001, 0.01]\n",
"\n",
"}\n",
"\n",
"mlp_grid = RandomizedSearchCV(MLPRegressor(max_iter=5500, random_state=42), mlp_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"\n",
"mlp_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best MLP:\", mlp_grid.best_params_, \"Best R²:\", mlp_grid.best_score_)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"knn_best = knn_grid.best_estimator_\n",
"dt_best = dt_grid.best_estimator_\n",
"rf_best = rf_grid.best_estimator_\n",
"cat_best = cat_grid.best_estimator_\n",
"mlp_best = mlp_grid.best_estimator_\n",
"def evaluate_model(model, X_test, y_test):\n",
" y_pred = model.predict(X_test)\n",
" mae = mean_absolute_error(y_test, y_pred)\n",
" mse = mean_squared_error(y_test, y_pred)\n",
" rmse = np.sqrt(mse)\n",
" mape = np.nanmean(np.abs((y_test - y_pred) / np.where(y_test == 0, np.nan, y_test))) * 100  # skip zero actuals to avoid division by zero\n",
" r2 = r2_score(y_test, y_pred)\n",
" return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}\n",
"\n",
"models = {'KNN': knn_best, 'Decision Tree': dt_best, 'Random Forest': rf_best, 'CatBoost': cat_best, 'MLP': mlp_best}\n",
"\n",
"for name, model in models.items():\n",
" results = evaluate_model(model, X_test, y_test)\n",
" print(f\"{name} Evaluation: {results}\")\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Список моделей и их названий\n",
"models = {\n",
" \"KNN\": knn_best,\n",
" \"Decision Tree\": dt_best,\n",
" \"Random Forest\": rf_best,\n",
" \"CatBoost\": cat_best,\n",
" \"MLP\": mlp_best\n",
"}\n",
"\n",
"# Создаем DataFrame для предсказанных значений\n",
"predictions_df = pd.DataFrame({\"Actual\": y_test})\n",
"\n",
"# Генерируем предсказания для каждой модели\n",
"for name, model in models.items():\n",
" predictions_df[name] = model.predict(X_test)\n",
"\n",
"# Выводим первые 10 строк предсказаний\n",
"print(predictions_df.head(10))\n",
"\n",
"# Визуализация предсказаний\n",
"plt.figure(figsize=(12, 6))\n",
"plt.plot(predictions_df[\"Actual\"].values, label=\"Actual\", color=\"black\", linewidth=2)\n",
"\n",
"for name in models.keys():\n",
" plt.plot(predictions_df[name].values, label=name, linestyle=\"--\")\n",
"\n",
"plt.legend()\n",
"plt.title(\"Actual vs Predicted Values\")\n",
"plt.xlabel(\"Samples\")\n",
"plt.ylabel(\"Solar Energy Output\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"# Список моделей и их названий\n",
"models = {\n",
" \"KNN\": knn_best,\n",
" \"Decision_Tree\": dt_best,\n",
" \"Random_Forest\": rf_best,\n",
" \"CatBoost\": cat_best,\n",
" \"MLP\": mlp_best\n",
"}\n",
"\n",
"# Сохраняем каждую модель в файл .pkl\n",
"for name, model in models.items():\n",
" with open(f\"{name}.pkl\", \"wb\") as file:\n",
" pickle.dump(model, file)\n",
"\n",
"print(\"Все модели сохранены в формате .pkl!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}