Files
artificial_intelligence_sys…/андрюха1.ipynb
2025-04-04 13:28:56 +03:00

889 lines
37 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа 1\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\Cyber\\\\Downloads\\\\daily-weather-dataset_chronological-order.xlsx'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_excel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mC:\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mUsers\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mCyber\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mDownloads\u001b[39;49m\u001b[38;5;124;43m\\\u001b[39;49m\u001b[38;5;124;43mdaily-weather-dataset_chronological-order.xlsx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msheet_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdaily\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/excel/_base.py:495\u001b[0m, in \u001b[0;36mread_excel\u001b[0;34m(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend, engine_kwargs)\u001b[0m\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(io, ExcelFile):\n\u001b[1;32m 494\u001b[0m should_close \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 495\u001b[0m io \u001b[38;5;241m=\u001b[39m \u001b[43mExcelFile\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mio\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 501\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m engine \u001b[38;5;129;01mand\u001b[39;00m engine \u001b[38;5;241m!=\u001b[39m io\u001b[38;5;241m.\u001b[39mengine:\n\u001b[1;32m 502\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 503\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEngine should not be specified when passing \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 
504\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man ExcelFile - ExcelFile already has the engine set\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 505\u001b[0m )\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/excel/_base.py:1550\u001b[0m, in \u001b[0;36mExcelFile.__init__\u001b[0;34m(self, path_or_buffer, engine, storage_options, engine_kwargs)\u001b[0m\n\u001b[1;32m 1548\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxls\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1549\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1550\u001b[0m ext \u001b[38;5;241m=\u001b[39m \u001b[43minspect_excel_format\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1551\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 1552\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1553\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ext \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1554\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1555\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExcel file format cannot be determined, you must specify \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1556\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124man engine manually.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1557\u001b[0m )\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/excel/_base.py:1402\u001b[0m, in \u001b[0;36minspect_excel_format\u001b[0;34m(content_or_path, storage_options)\u001b[0m\n\u001b[1;32m 1399\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(content_or_path, \u001b[38;5;28mbytes\u001b[39m):\n\u001b[1;32m 1400\u001b[0m content_or_path \u001b[38;5;241m=\u001b[39m BytesIO(content_or_path)\n\u001b[0;32m-> 1402\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1403\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[1;32m 1404\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m handle:\n\u001b[1;32m 1405\u001b[0m stream \u001b[38;5;241m=\u001b[39m handle\u001b[38;5;241m.\u001b[39mhandle\n\u001b[1;32m 1406\u001b[0m stream\u001b[38;5;241m.\u001b[39mseek(\u001b[38;5;241m0\u001b[39m)\n",
"File \u001b[0;32m~/Nextcloud/#Учёба/институт/#4 Курс/Системы искусственного интеллекта/Jupyter/venv/lib/python3.13/site-packages/pandas/io/common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m--> 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 883\u001b[0m handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:\\\\Users\\\\Cyber\\\\Downloads\\\\daily-weather-dataset_chronological-order.xlsx'"
]
}
],
"source": [
"data = pd.read_excel(\"daily-weather-dataset_chronological-order.xlsx\", sheet_name=\"daily\")  # relative path: keep the dataset next to the notebook; the old absolute Windows path (C:\\Users\\Cyber\\...) failed because this kernel runs on Linux"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.shape # Параметр .shape показывает размерность датафрейма"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.size # Параметр .size показывает количество элементов в датафрейме"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.count() # Метод count считает сколько всего непустых записей в каждом столбце"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.info() # Метод .info() показывает тип каждого столбца и занимаемую память"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.dtypes # Параметр .dtypes показывает просто тип каждого столбца"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.isna().head() # Метод .isna() вместо каждого значения подставит True (значение NaN) или False (действительное значение)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.isna().sum() # Подсчитаем количество пропусков в каждом столбце с помощью метода .sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"proc = data.isna().sum().sum() # Подсчитаем сколько всего пропусков (во всех столбцах) в нашем датафрейме\n",
"print(proc) # Отобразим количество посчитанных пропусков"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"proc = data.isna().sum().sum() / data.size\n",
"print(round(100*proc,1), '%', sep='')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data)\n",
"# Функция для удаления выбросов по IQR\n",
"def remove_outliers(df, column):\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n",
"\n",
"# Удаляем выбросы из указанных столбцов\n",
"columns_to_clean = [\"Cloud coverage\"]\n",
"for col in columns_to_clean:\n",
" df = remove_outliers(df, col)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(axis=1, how='all')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#3 Для прогнозирования солнечной генерации применена группировка по месяцу и времени года чтобы учитывать сезонность выработки.\n",
"df[\"Date\"] = pd.to_datetime(df[\"Date\"]) # Преобразуем столбец 'Date' в формат datetime\n",
"df[\"Month\"] = df[\"Date\"].dt.month # Добавляем столбец 'Month' для группировки по месяцам\n",
"df_monthly = df.groupby(\"Month\").mean(numeric_only=True) # group by month and average; numeric_only=True avoids a TypeError on the datetime 'Date' column in pandas >= 2.0\n",
"df_monthly"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#5\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.1 \n",
"# Выбираем числовые столбцы, исключая \"Month\"\n",
"numeric_features = df.select_dtypes(include=[\"number\"]).columns\n",
"numeric_features = numeric_features.drop(\"Month\") # Убираем \"Month\"\n",
"\n",
"# Строим графики для всех числовых признаков (кроме \"Month\")\n",
"for col in numeric_features:\n",
" plt.figure(figsize=(14, 4))\n",
"\n",
" # График плотности (KDE)\n",
" plt.subplot(121)\n",
" sns.kdeplot(data=df, x=col)\n",
" plt.title(f\"Распределение: {col}\")\n",
"\n",
" # Boxplot (ящик с усами)\n",
" plt.subplot(122)\n",
" sns.boxplot(data=df, x=col)\n",
" plt.title(f\"Boxplot: {col}\")\n",
"\n",
" plt.show()\n",
"\n",
"#plt.figure(figsize=(8, 5))\n",
"#sns.histplot(df['Visibility'], bins=20, kde=True)\n",
"#plt.title(\"Гистограмма распределения параметра видимости\")\n",
"#plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.3. Матрица корреляции\n",
"# Позволяет увидеть, как связаны между собой числовые переменные и для поиска зависимостей между погодными параметрами\n",
"# Чем ближе значение к 1 или -1, тем сильнее положительная или отрицательная корреляция\n",
"plt.figure(figsize=(8, 5))\n",
"sns.heatmap(df_monthly.corr(), annot=True, cmap='coolwarm', fmt=\".2f\")\n",
"plt.title(\"Матрица корреляции погодных параметров\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.4. Диаграмма рассеяния облачности и солнечной энергии\n",
"# Показывает взаимосвязь между облачностью и уровнем солнечной энергии.\n",
"plt.figure(figsize=(8, 5))\n",
"sns.scatterplot(x=df['Cloud coverage'], y=df['Solar energy'])\n",
"plt.title(\"Диаграмма рассеяния: Облачность vs Солнечная энергия\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 5.5. Среднее значение облачности по месяцам\n",
"#Позволяет проанализировать сезонные тренды облачности.\n",
"#Позволяет быстро увидеть, в какие месяцы облачность выше или ниже.\n",
"plt.figure(figsize=(8, 5))\n",
"sns.barplot(x=df['Month'], y=df['Cloud coverage'], estimator='mean')  # 'mean' matches the title; estimator=sum plotted monthly totals, not averages\n",
"plt.title(\"Среднее значение облачности по месяцам\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_selection import VarianceThreshold\n",
"\n",
"# Оставляем только числовые признаки\n",
"numeric_features = df.select_dtypes(include=[\"number\"])\n",
"\n",
"# Удаляем признаки с дисперсией ниже 0.01\n",
"selector = VarianceThreshold(threshold=0.01)\n",
"df_var = selector.fit_transform(numeric_features)\n",
"\n",
"# Получаем оставшиеся названия признаков\n",
"selected_features = numeric_features.columns[selector.get_support()]\n",
"df_selected = df[selected_features]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_selection import SelectKBest, f_regression\n",
"\n",
"# Убираем столбец с датами и выбираем только числовые признаки\n",
"X = df.select_dtypes(include=[\"number\"]).drop(columns=[\"Solar energy\"]) \n",
"y = df[\"Solar energy\"] # Целевая переменная\n",
"\n",
"# Отбор 5 лучших признаков\n",
"selector = SelectKBest(score_func=f_regression, k=5)\n",
"X_new = selector.fit_transform(X, y)\n",
"\n",
"# Выводим выбранные признаки\n",
"selected_features = X.columns[selector.get_support()]\n",
"print(\"Выбранные признаки:\", selected_features)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_selection import RFE\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"model = LinearRegression()\n",
"rfe = RFE(model, n_features_to_select=5)\n",
"X_rfe = rfe.fit_transform(X, y)\n",
"\n",
"selected_features = X.columns[rfe.support_]\n",
"print(\"Новые лучшие признаки:\", selected_features)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"Temp_Cloud\"] = df[\"Temperature\"] * df[\"Cloud coverage\"]\n",
"print(\"\\nDataFrame с добавленным признаком 'Temp_Cloud':\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 7\n",
"plt.figure(figsize=(8, 5))\n",
"sns.histplot(df[\"Solar energy\"], bins=30, kde=True)\n",
"plt.title(\"Распределение целевой переменной (Solar Energy)\")\n",
"plt.xlabel(\"Solar Energy\")\n",
"plt.ylabel(\"Частота\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(12, 6))\n",
"sns.boxplot(x=df[\"Month\"], y=df[\"Solar energy\"])\n",
"plt.title(\"Распределение Solar Energy по месяцам\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#8\n",
"from sklearn.model_selection import train_test_split\n",
"df = df.drop(columns=[\"Date\"])\n",
"df = df.drop(columns=[\"Month\"])\n",
"y = df[\"Solar energy\"]\n",
"# Удалим целевую переменную\n",
"X = df.drop(columns=[\"Solar energy\"])\n",
"\n",
"\n",
"# Разбиение (80% train, 20% test)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"# Вывод размеров получившихся выборок\n",
"print(f\"Размер X_train: {X_train.shape}\")\n",
"print(f\"Размер X_test: {X_test.shape}\")\n",
"print(f\"Размер y_train: {y_train.shape}\")\n",
"print(f\"Размер y_test: {y_test.shape}\")\n",
"print(\"Обучающая выборка X:\")\n",
"print(X_train)\n",
"print(\"\\nТестовая выборка X:\")\n",
"print(X_test)\n",
"print(\"\\nОбучающая выборка y:\")\n",
"print(y_train)\n",
"print(\"\\nТестовая выборка y:\")\n",
"print(y_test)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа 2\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score\n",
"\n",
"# Модели машинного обучения\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from catboost import CatBoostRegressor\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"from tensorflow.keras.callbacks import EarlyStopping\n",
"from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold\n",
"from sklearn.neural_network import MLPRegressor\n",
"cv = KFold(n_splits=5, shuffle=True, random_state=42)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Knn\n",
"knn_params = {\n",
" 'n_neighbors': [3, 5, 7, 9],\n",
" 'weights': ['uniform', 'distance'],\n",
" 'metric': ['euclidean', 'manhattan']\n",
"}\n",
"knn_grid = GridSearchCV(KNeighborsRegressor(), knn_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"knn_grid.fit(X_train, y_train)\n",
"print(\"Best KNN:\", knn_grid.best_params_, \"Best R²:\", knn_grid.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Дерево решений\n",
"dt_params = {\n",
" 'max_depth': [3, 5, 10, 15],\n",
" 'min_samples_split': [2, 5, 10],\n",
" 'min_samples_leaf': [1, 2, 5]\n",
"}\n",
"dt_grid = GridSearchCV(DecisionTreeRegressor(random_state=42), dt_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"dt_grid.fit(X_train, y_train)\n",
"print(\"Best Decision Tree:\", dt_grid.best_params_, \"Best R²:\", dt_grid.best_score_)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Случайный лес\n",
"rf_params = {\n",
" 'n_estimators': [100, 200, 300],\n",
" 'max_depth': [5, 10, 15],\n",
" 'min_samples_split': [2, 5, 10]\n",
"}\n",
"rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"rf_grid.fit(X_train, y_train)\n",
"print(\"Best Random Forest:\", rf_grid.best_params_, \"Best R²:\", rf_grid.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Градиентный бустинг\n",
"cat_params = {\n",
" 'iterations': [100, 500, 1000],\n",
" 'learning_rate': [0.01, 0.05, 0.1],\n",
" 'depth': [4, 6, 8]\n",
"}\n",
"cat_grid = RandomizedSearchCV(CatBoostRegressor(verbose=0, random_state=42), cat_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"cat_grid.fit(X_train, y_train)\n",
"print(\"Best CatBoost:\", cat_grid.best_params_, \"Best R²:\", cat_grid.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Нейронная сеть\n",
"mlp_params = {\n",
" 'hidden_layer_sizes': [(50,), (100,), (50, 50)],\n",
" 'activation': ['relu', 'tanh'],\n",
" 'alpha': [0.0001, 0.001, 0.01]\n",
"}\n",
"mlp_grid = RandomizedSearchCV(MLPRegressor(max_iter=5500, random_state=42), mlp_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"mlp_grid.fit(X_train, y_train)\n",
"print(\"Best MLP:\", mlp_grid.best_params_, \"Best R²:\", mlp_grid.best_score_)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Сравнение моделей\n",
"models = [\"KNN\", \"Decision Tree\", \"Random Forest\", \"CatBoost\", \"Neural Network\"]\n",
"scores = [\n",
" knn_grid.best_score_,\n",
" dt_grid.best_score_,\n",
" rf_grid.best_score_,\n",
" cat_grid.best_score_,\n",
" mlp_grid.best_score_\n",
"]\n",
"\n",
"plt.figure(figsize=(10, 5))\n",
"sns.barplot(x=models, y=scores)\n",
"plt.ylabel(\"R2 Score\")\n",
"plt.title(\"Сравнение моделей машинного обучения\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа №3 - Оценка моделей\n",
"knn_best = knn_grid.best_estimator_\n",
"dt_best = dt_grid.best_estimator_\n",
"rf_best = rf_grid.best_estimator_\n",
"cat_best = cat_grid.best_estimator_\n",
"mlp_best = mlp_grid.best_estimator_\n",
"def evaluate_model(model, X_test, y_test):\n",
" y_pred = model.predict(X_test)\n",
" mae = mean_absolute_error(y_test, y_pred)\n",
" mse = mean_squared_error(y_test, y_pred)\n",
" rmse = np.sqrt(mse)\n",
" mape = np.nanmean(np.abs((y_test - y_pred) / np.where(y_test == 0, np.nan, y_test))) * 100  # skip zero actuals to avoid division by zero\n",
" r2 = r2_score(y_test, y_pred)\n",
" return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}\n",
"\n",
"models = {'KNN': knn_best, 'Decision Tree': dt_best, 'Random Forest': rf_best, 'CatBoost': cat_best, 'MLP': mlp_best}\n",
"\n",
"for name, model in models.items():\n",
" results = evaluate_model(model, X_test, y_test)\n",
" print(f\"{name} Evaluation: {results}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Визуализация результатов\n",
"# Словарь для хранения результатов\n",
"metrics = {'Model': [], 'MAE': [], 'MSE': [], 'RMSE': [], 'MAPE': [], 'R2': []}\n",
"\n",
"# Оценка каждой модели\n",
"for name, model in models.items():\n",
" results = evaluate_model(model, X_test, y_test)\n",
" metrics['Model'].append(name)\n",
" for key in results:\n",
" metrics[key].append(results[key])\n",
"\n",
"# Преобразование в DataFrame\n",
"metrics_df = pd.DataFrame(metrics)\n",
"metrics_df.set_index('Model', inplace=True)\n",
"\n",
"# Названия метрик и их описания для графиков\n",
"metric_labels = {\n",
" 'MAE': 'Средняя абсолютная ошибка (MAE)',\n",
" 'MSE': 'Среднеквадратическая ошибка (MSE)',\n",
" 'RMSE': 'Корень из MSE (RMSE)',\n",
" 'MAPE': 'Средняя абсолютная процентная ошибка (MAPE)',\n",
" 'R2': 'Коэффициент детерминации (R²)'\n",
"}\n",
"\n",
"# Отображение каждого графика отдельно\n",
"for metric in metrics_df.columns:\n",
" plt.figure(figsize=(8, 5))\n",
" sns.barplot(\n",
" x=metrics_df.index, \n",
" y=metrics_df[metric], \n",
" hue=metrics_df.index, # Добавляем hue\n",
" palette='viridis', \n",
" edgecolor='black',\n",
" legend=False # Отключаем легенду, так как цвета соответствуют x\n",
" )\n",
" plt.title(f'Сравнение моделей по {metric_labels[metric]}', fontsize=14)\n",
" plt.xlabel(\"Модель\", fontsize=12)\n",
" plt.ylabel(metric_labels[metric], fontsize=12)\n",
" plt.xticks(rotation=45)\n",
" plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
" plt.show()\n",
"\n",
"\n",
"# Визуализация фактических vs предсказанных значений для лучшей модели (по R²)\n",
"best_model_name = metrics_df.sort_values(by='R2', ascending=False).index[0]\n",
"best_model = models[best_model_name]\n",
"\n",
"y_pred_best = best_model.predict(X_test)\n",
"\n",
"plt.figure(figsize=(8, 8))\n",
"sns.scatterplot(x=y_test, y=y_pred_best, alpha=0.6)\n",
"plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--r', label=\"Идеальное предсказание\")\n",
"plt.xlabel(\"Фактические значения\", fontsize=12)\n",
"plt.ylabel(\"Предсказанные значения\", fontsize=12)\n",
"plt.title(f\"Фактические vs. Предсказанные ({best_model_name})\", fontsize=14)\n",
"plt.legend()\n",
"plt.grid(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Лабораторная работа №4\n",
"df = df.drop(columns=[\"Altimeter\"])\n",
"df = df.drop(columns=[\"Temp_Cloud\"])\n",
"y = df[\"Solar energy\"]\n",
"X = df.drop(columns=[\"Solar energy\"])\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Knn\n",
"\n",
"knn_params = {\n",
"\n",
" 'n_neighbors': [3, 5, 7, 9],\n",
"\n",
" 'weights': ['uniform', 'distance'],\n",
"\n",
" 'metric': ['euclidean', 'manhattan']\n",
"\n",
"}\n",
"\n",
"knn_grid = GridSearchCV(KNeighborsRegressor(), knn_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"\n",
"knn_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best KNN:\", knn_grid.best_params_, \"Best R²:\", knn_grid.best_score_)\n",
"\n",
"# Дерево решений\n",
"\n",
"dt_params = {\n",
"\n",
" 'max_depth': [3, 5, 10, 15],\n",
"\n",
" 'min_samples_split': [2, 5, 10],\n",
"\n",
" 'min_samples_leaf': [1, 2, 5]\n",
"\n",
"}\n",
"\n",
"dt_grid = GridSearchCV(DecisionTreeRegressor(random_state=42), dt_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"\n",
"dt_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best Decision Tree:\", dt_grid.best_params_, \"Best R²:\", dt_grid.best_score_)\n",
"\n",
"# Случайный лес\n",
"\n",
"rf_params = {\n",
"\n",
" 'n_estimators': [100, 200, 300],\n",
"\n",
" 'max_depth': [5, 10, 15],\n",
"\n",
" 'min_samples_split': [2, 5, 10]\n",
"\n",
"}\n",
"\n",
"rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=cv, scoring='r2', n_jobs=-1)\n",
"\n",
"rf_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best Random Forest:\", rf_grid.best_params_, \"Best R²:\", rf_grid.best_score_)\n",
"\n",
"# Градиентный бустинг\n",
"\n",
"cat_params = {\n",
"\n",
" 'iterations': [100, 500, 1000],\n",
"\n",
" 'learning_rate': [0.01, 0.05, 0.1],\n",
"\n",
" 'depth': [4, 6, 8]\n",
"\n",
"}\n",
"\n",
"cat_grid = RandomizedSearchCV(CatBoostRegressor(verbose=0, random_state=42), cat_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"\n",
"cat_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best CatBoost:\", cat_grid.best_params_, \"Best R²:\", cat_grid.best_score_)\n",
"\n",
"# Нейронная сеть\n",
"\n",
"mlp_params = {\n",
"\n",
" 'hidden_layer_sizes': [(50,), (100,), (50, 50)],\n",
"\n",
" 'activation': ['relu', 'tanh'],\n",
"\n",
" 'alpha': [0.0001, 0.001, 0.01]\n",
"\n",
"}\n",
"\n",
"mlp_grid = RandomizedSearchCV(MLPRegressor(max_iter=5500, random_state=42), mlp_params, cv=cv, scoring='r2', n_jobs=-1, n_iter=10)\n",
"\n",
"mlp_grid.fit(X_train, y_train)\n",
"\n",
"print(\"Best MLP:\", mlp_grid.best_params_, \"Best R²:\", mlp_grid.best_score_)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"knn_best = knn_grid.best_estimator_\n",
"dt_best = dt_grid.best_estimator_\n",
"rf_best = rf_grid.best_estimator_\n",
"cat_best = cat_grid.best_estimator_\n",
"mlp_best = mlp_grid.best_estimator_\n",
"def evaluate_model(model, X_test, y_test):\n",
" y_pred = model.predict(X_test)\n",
" mae = mean_absolute_error(y_test, y_pred)\n",
" mse = mean_squared_error(y_test, y_pred)\n",
" rmse = np.sqrt(mse)\n",
" mape = np.nanmean(np.abs((y_test - y_pred) / np.where(y_test == 0, np.nan, y_test))) * 100  # skip zero actuals to avoid division by zero\n",
" r2 = r2_score(y_test, y_pred)\n",
" return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'MAPE': mape, 'R2': r2}\n",
"\n",
"models = {'KNN': knn_best, 'Decision Tree': dt_best, 'Random Forest': rf_best, 'CatBoost': cat_best, 'MLP': mlp_best}\n",
"\n",
"for name, model in models.items():\n",
" results = evaluate_model(model, X_test, y_test)\n",
" print(f\"{name} Evaluation: {results}\")\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Список моделей и их названий\n",
"models = {\n",
" \"KNN\": knn_best,\n",
" \"Decision Tree\": dt_best,\n",
" \"Random Forest\": rf_best,\n",
" \"CatBoost\": cat_best,\n",
" \"MLP\": mlp_best\n",
"}\n",
"\n",
"# Создаем DataFrame для предсказанных значений\n",
"predictions_df = pd.DataFrame({\"Actual\": y_test})\n",
"\n",
"# Генерируем предсказания для каждой модели\n",
"for name, model in models.items():\n",
" predictions_df[name] = model.predict(X_test)\n",
"\n",
"# Выводим первые 10 строк предсказаний\n",
"print(predictions_df.head(10))\n",
"\n",
"# Визуализация предсказаний\n",
"plt.figure(figsize=(12, 6))\n",
"plt.plot(predictions_df[\"Actual\"].values, label=\"Actual\", color=\"black\", linewidth=2)\n",
"\n",
"for name in models.keys():\n",
" plt.plot(predictions_df[name].values, label=name, linestyle=\"--\")\n",
"\n",
"plt.legend()\n",
"plt.title(\"Actual vs Predicted Values\")\n",
"plt.xlabel(\"Samples\")\n",
"plt.ylabel(\"Solar Energy Output\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"# Список моделей и их названий\n",
"models = {\n",
" \"KNN\": knn_best,\n",
" \"Decision_Tree\": dt_best,\n",
" \"Random_Forest\": rf_best,\n",
" \"CatBoost\": cat_best,\n",
" \"MLP\": mlp_best\n",
"}\n",
"\n",
"# Сохраняем каждую модель в файл .pkl\n",
"for name, model in models.items():\n",
" with open(f\"{name}.pkl\", \"wb\") as file:\n",
" pickle.dump(model, file)\n",
"\n",
"print(\"Все модели сохранены в формате .pkl!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}