{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "ojlhGzdxhkwR" }, "outputs": [], "source": [ "## Pandas. Загрузка библиотек" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "DqYWosnHhkwU" }, "outputs": [], "source": [ "import pandas as pd # Загружаем библиотеку Pandas\n", "import numpy as np # Загружаем библиотеку numpy" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "vX8TZMleKHiw" }, "outputs": [], "source": [ "data = pd.read_csv('wine_base.csv', index_col = 0) # С помощью метода read_csv загружаем файл wine_base.csv и записываем данные в data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "ggbR-UGu7wJZ", "outputId": "735190d5-000c-479d-fb04-d7973bd58dd4" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2varietywinery
0USThis tremendous 100% varietal wine hails from ...Martha's Vineyard96235.0CaliforniaNapa ValleyNapaCabernet SauvignonHeitz
1SpainRipe aromas of fig, blackberry and cassis are ...Carodorum Selección Especial Reserva96110.0Northern SpainToroNaNTinta de ToroBodega Carmen Rodríguez
2USMac Watson honors the memory of a wine once ma...Special Selected Late Harvest9690.0CaliforniaKnights ValleySonomaSauvignon BlancMacauley
3USThis spent 20 months in 30% new French oak, an...Reserve9665.0OregonWillamette ValleyWillamette ValleyPinot NoirPonzi
4FranceThis is the top wine from La Bégude, named aft...La Brûlade9566.0ProvenceBandolNaNProvence red blendDomaine de la Bégude
\n", "
" ], "text/plain": [ " country description \\\n", "0 US This tremendous 100% varietal wine hails from ... \n", "1 Spain Ripe aromas of fig, blackberry and cassis are ... \n", "2 US Mac Watson honors the memory of a wine once ma... \n", "3 US This spent 20 months in 30% new French oak, an... \n", "4 France This is the top wine from La Bégude, named aft... \n", "\n", " designation points price province \\\n", "0 Martha's Vineyard 96 235.0 California \n", "1 Carodorum Selección Especial Reserva 96 110.0 Northern Spain \n", "2 Special Selected Late Harvest 96 90.0 California \n", "3 Reserve 96 65.0 Oregon \n", "4 La Brûlade 95 66.0 Provence \n", "\n", " region_1 region_2 variety \\\n", "0 Napa Valley Napa Cabernet Sauvignon \n", "1 Toro NaN Tinta de Toro \n", "2 Knights Valley Sonoma Sauvignon Blanc \n", "3 Willamette Valley Willamette Valley Pinot Noir \n", "4 Bandol NaN Provence red blend \n", "\n", " winery \n", "0 Heitz \n", "1 Bodega Carmen Rodríguez \n", "2 Macauley \n", "3 Ponzi \n", "4 Domaine de la Bégude " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "_mybfuLlhkzU" }, "outputs": [], "source": [ "## 1. 
Конкатенация\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "vUn3NnurhkzV", "outputId": "11da5650-2592-40f3-c0d6-59a503352985" }, "outputs": [ { "data": { "text/plain": [ "[4, 5, 6, 7, 234, 23, 0, 1, 2, 3, 4, 5, 6, 7, 234, 23, 0, 1, 2, 3, 0, 1, 2, 3]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = [0, 1, 2, 3] # Создаем список а\n", "b = [4, 5, 6, 7, 234, 23] # Создаем список b\n", "b + a + b + a + a # Делаем конкатенацию (объединение) списков" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "DzVWtFnXhkzX" }, "outputs": [], "source": [ "data_1 = data[0:15].copy() # Берем первые 15 строк из data и делаем их копию в data_1\n", "data_2 = data[15:30].copy() # Берем строки с 15 по 29 из data (правая граница среза не включается) и делаем их копию в data_2" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "T2Nq4UnUhkza", "outputId": "0d7c713f-8eae-4e25-878c-c7e1cf26f8d5" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2varietywinery
15USFirst made in 2006, this succulent luscious Ch...Sigrid9590.0OregonWillamette ValleyWillamette ValleyChardonnayBergström
16USThis blockbuster, powerhouse of a wine suggest...Rainin Vineyard95325.0CaliforniaDiamond Mountain DistrictNapaCabernet SauvignonHall
17SpainNicely oaked blackberry, licorice, vanilla and...6 Años Reserva Premium9580.0Northern SpainRibera del DueroNaNTempranilloValduero
18FranceComing from a seven-acre vineyard named after ...Le Pigeonnier95290.0Southwest FranceCahorsNaNMalbecChâteau Lagrézette
19USThis fresh and lively medium-bodied wine is be...Gap's Crown Vineyard9575.0CaliforniaSonoma CoastSonomaPinot NoirGary Farrell
\n", "
" ], "text/plain": [ " country description \\\n", "15 US First made in 2006, this succulent luscious Ch... \n", "16 US This blockbuster, powerhouse of a wine suggest... \n", "17 Spain Nicely oaked blackberry, licorice, vanilla and... \n", "18 France Coming from a seven-acre vineyard named after ... \n", "19 US This fresh and lively medium-bodied wine is be... \n", "\n", " designation points price province \\\n", "15 Sigrid 95 90.0 Oregon \n", "16 Rainin Vineyard 95 325.0 California \n", "17 6 Años Reserva Premium 95 80.0 Northern Spain \n", "18 Le Pigeonnier 95 290.0 Southwest France \n", "19 Gap's Crown Vineyard 95 75.0 California \n", "\n", " region_1 region_2 variety \\\n", "15 Willamette Valley Willamette Valley Chardonnay \n", "16 Diamond Mountain District Napa Cabernet Sauvignon \n", "17 Ribera del Duero NaN Tempranillo \n", "18 Cahors NaN Malbec \n", "19 Sonoma Coast Sonoma Pinot Noir \n", "\n", " winery \n", "15 Bergström \n", "16 Hall \n", "17 Valduero \n", "18 Château Lagrézette \n", "19 Gary Farrell " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_2.head() # Выводим первые 5 строк data_2" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "hQn7iScFhkzb", "outputId": "1dfa87c8-4278-49f5-c1d4-8507d2d13667" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2varietywinery
0USThis tremendous 100% varietal wine hails from ...Martha's Vineyard96235.0CaliforniaNapa ValleyNapaCabernet SauvignonHeitz
1SpainRipe aromas of fig, blackberry and cassis are ...Carodorum Selección Especial Reserva96110.0Northern SpainToroNaNTinta de ToroBodega Carmen Rodríguez
2USMac Watson honors the memory of a wine once ma...Special Selected Late Harvest9690.0CaliforniaKnights ValleySonomaSauvignon BlancMacauley
3USThis spent 20 months in 30% new French oak, an...Reserve9665.0OregonWillamette ValleyWillamette ValleyPinot NoirPonzi
4FranceThis is the top wine from La Bégude, named aft...La Brûlade9566.0ProvenceBandolNaNProvence red blendDomaine de la Bégude
5SpainDeep, dense and pure from the opening bell, th...Numanthia9573.0Northern SpainToroNaNTinta de ToroNumanthia
6SpainSlightly gritty black-fruit aromas include a s...San Román9565.0Northern SpainToroNaNTinta de ToroMaurodos
7SpainLush cedary black-fruit aromas are luxe and of...Carodorum Único Crianza95110.0Northern SpainToroNaNTinta de ToroBodega Carmen Rodríguez
8USThis re-named vineyard was formerly bottled as...Silice9565.0OregonChehalem MountainsWillamette ValleyPinot NoirBergström
9USThe producer sources from two blocks of the vi...Gap's Crown Vineyard9560.0CaliforniaSonoma CoastSonomaPinot NoirBlue Farm
10ItalyElegance, complexity and structure come togeth...Ronco della Chiesa9580.0Northeastern ItalyCollioNaNFriulanoBorgo del Tiglio
11USFrom 18-year-old vines, this supple well-balan...Estate Vineyard Wadensvil Block9548.0OregonRibbon RidgeWillamette ValleyPinot NoirPatricia Green Cellars
12USA standout even in this terrific lineup of 201...Weber Vineyard9548.0OregonDundee HillsWillamette ValleyPinot NoirPatricia Green Cellars
13FranceThis wine is in peak condition. The tannins an...Château Montus Prestige9590.0Southwest FranceMadiranNaNTannatVignobles Brumont
14USWith its sophisticated mix of mineral, acid an...Grace Vineyard95185.0OregonDundee HillsWillamette ValleyPinot NoirDomaine Serene
15USFirst made in 2006, this succulent luscious Ch...Sigrid9590.0OregonWillamette ValleyWillamette ValleyChardonnayBergström
16USThis blockbuster, powerhouse of a wine suggest...Rainin Vineyard95325.0CaliforniaDiamond Mountain DistrictNapaCabernet SauvignonHall
17SpainNicely oaked blackberry, licorice, vanilla and...6 Años Reserva Premium9580.0Northern SpainRibera del DueroNaNTempranilloValduero
18FranceComing from a seven-acre vineyard named after ...Le Pigeonnier95290.0Southwest FranceCahorsNaNMalbecChâteau Lagrézette
19USThis fresh and lively medium-bodied wine is be...Gap's Crown Vineyard9575.0CaliforniaSonoma CoastSonomaPinot NoirGary Farrell
20USHeitz has made this stellar rosé from the rare...Grignolino9524.0CaliforniaNapa ValleyNapaRoséHeitz
21SpainAlluring, complex and powerful aromas of grill...Prado Enea Gran Reserva9579.0Northern SpainRiojaNaNTempranillo BlendMuga
22SpainTarry blackberry and cheesy oak aromas are app...Termanthia95220.0Northern SpainToroNaNTinta de ToroNumanthia
23USThe apogee of this ambitious winery's white wi...Giallo Solare9560.0CaliforniaEdna ValleyCentral CoastChardonnayCenter of Effort
24USSan Jose-based producer Adam Comartin heads 1,...R-Bar-R Ranch9545.0CaliforniaSanta Cruz MountainsCentral CoastPinot NoirComartin
25New ZealandYields were down in 2015, but intensity is up,...Maté's Vineyard9457.0KumeuNaNNaNChardonnayKumeu River
26USBergström has made a Shea designate since 2003...Shea Vineyard9462.0OregonWillamette ValleyNaNPinot NoirBergström
27USFocused and dense, this intense wine captures ...Abetina94105.0OregonWillamette ValleyWillamette ValleyPinot NoirPonzi
28USCranberry, baked rhubarb, anise and crushed sl...Garys' Vineyard9460.0CaliforniaSanta Lucia HighlandsCentral CoastPinot NoirRoar
29USThis standout Rocks District wine brings earth...The Funk Estate9460.0WashingtonWalla Walla Valley (WA)Columbia ValleySyrahSaviah
\n", "
" ], "text/plain": [ " country description \\\n", "0 US This tremendous 100% varietal wine hails from ... \n", "1 Spain Ripe aromas of fig, blackberry and cassis are ... \n", "2 US Mac Watson honors the memory of a wine once ma... \n", "3 US This spent 20 months in 30% new French oak, an... \n", "4 France This is the top wine from La Bégude, named aft... \n", "5 Spain Deep, dense and pure from the opening bell, th... \n", "6 Spain Slightly gritty black-fruit aromas include a s... \n", "7 Spain Lush cedary black-fruit aromas are luxe and of... \n", "8 US This re-named vineyard was formerly bottled as... \n", "9 US The producer sources from two blocks of the vi... \n", "10 Italy Elegance, complexity and structure come togeth... \n", "11 US From 18-year-old vines, this supple well-balan... \n", "12 US A standout even in this terrific lineup of 201... \n", "13 France This wine is in peak condition. The tannins an... \n", "14 US With its sophisticated mix of mineral, acid an... \n", "15 US First made in 2006, this succulent luscious Ch... \n", "16 US This blockbuster, powerhouse of a wine suggest... \n", "17 Spain Nicely oaked blackberry, licorice, vanilla and... \n", "18 France Coming from a seven-acre vineyard named after ... \n", "19 US This fresh and lively medium-bodied wine is be... \n", "20 US Heitz has made this stellar rosé from the rare... \n", "21 Spain Alluring, complex and powerful aromas of grill... \n", "22 Spain Tarry blackberry and cheesy oak aromas are app... \n", "23 US The apogee of this ambitious winery's white wi... \n", "24 US San Jose-based producer Adam Comartin heads 1,... \n", "25 New Zealand Yields were down in 2015, but intensity is up,... \n", "26 US Bergström has made a Shea designate since 2003... \n", "27 US Focused and dense, this intense wine captures ... \n", "28 US Cranberry, baked rhubarb, anise and crushed sl... \n", "29 US This standout Rocks District wine brings earth... 
\n", "\n", " designation points price province \\\n", "0 Martha's Vineyard 96 235.0 California \n", "1 Carodorum Selección Especial Reserva 96 110.0 Northern Spain \n", "2 Special Selected Late Harvest 96 90.0 California \n", "3 Reserve 96 65.0 Oregon \n", "4 La Brûlade 95 66.0 Provence \n", "5 Numanthia 95 73.0 Northern Spain \n", "6 San Román 95 65.0 Northern Spain \n", "7 Carodorum Único Crianza 95 110.0 Northern Spain \n", "8 Silice 95 65.0 Oregon \n", "9 Gap's Crown Vineyard 95 60.0 California \n", "10 Ronco della Chiesa 95 80.0 Northeastern Italy \n", "11 Estate Vineyard Wadensvil Block 95 48.0 Oregon \n", "12 Weber Vineyard 95 48.0 Oregon \n", "13 Château Montus Prestige 95 90.0 Southwest France \n", "14 Grace Vineyard 95 185.0 Oregon \n", "15 Sigrid 95 90.0 Oregon \n", "16 Rainin Vineyard 95 325.0 California \n", "17 6 Años Reserva Premium 95 80.0 Northern Spain \n", "18 Le Pigeonnier 95 290.0 Southwest France \n", "19 Gap's Crown Vineyard 95 75.0 California \n", "20 Grignolino 95 24.0 California \n", "21 Prado Enea Gran Reserva 95 79.0 Northern Spain \n", "22 Termanthia 95 220.0 Northern Spain \n", "23 Giallo Solare 95 60.0 California \n", "24 R-Bar-R Ranch 95 45.0 California \n", "25 Maté's Vineyard 94 57.0 Kumeu \n", "26 Shea Vineyard 94 62.0 Oregon \n", "27 Abetina 94 105.0 Oregon \n", "28 Garys' Vineyard 94 60.0 California \n", "29 The Funk Estate 94 60.0 Washington \n", "\n", " region_1 region_2 variety \\\n", "0 Napa Valley Napa Cabernet Sauvignon \n", "1 Toro NaN Tinta de Toro \n", "2 Knights Valley Sonoma Sauvignon Blanc \n", "3 Willamette Valley Willamette Valley Pinot Noir \n", "4 Bandol NaN Provence red blend \n", "5 Toro NaN Tinta de Toro \n", "6 Toro NaN Tinta de Toro \n", "7 Toro NaN Tinta de Toro \n", "8 Chehalem Mountains Willamette Valley Pinot Noir \n", "9 Sonoma Coast Sonoma Pinot Noir \n", "10 Collio NaN Friulano \n", "11 Ribbon Ridge Willamette Valley Pinot Noir \n", "12 Dundee Hills Willamette Valley Pinot Noir \n", "13 Madiran NaN 
Tannat \n", "14 Dundee Hills Willamette Valley Pinot Noir \n", "15 Willamette Valley Willamette Valley Chardonnay \n", "16 Diamond Mountain District Napa Cabernet Sauvignon \n", "17 Ribera del Duero NaN Tempranillo \n", "18 Cahors NaN Malbec \n", "19 Sonoma Coast Sonoma Pinot Noir \n", "20 Napa Valley Napa Rosé \n", "21 Rioja NaN Tempranillo Blend \n", "22 Toro NaN Tinta de Toro \n", "23 Edna Valley Central Coast Chardonnay \n", "24 Santa Cruz Mountains Central Coast Pinot Noir \n", "25 NaN NaN Chardonnay \n", "26 Willamette Valley NaN Pinot Noir \n", "27 Willamette Valley Willamette Valley Pinot Noir \n", "28 Santa Lucia Highlands Central Coast Pinot Noir \n", "29 Walla Walla Valley (WA) Columbia Valley Syrah \n", "\n", " winery \n", "0 Heitz \n", "1 Bodega Carmen Rodríguez \n", "2 Macauley \n", "3 Ponzi \n", "4 Domaine de la Bégude \n", "5 Numanthia \n", "6 Maurodos \n", "7 Bodega Carmen Rodríguez \n", "8 Bergström \n", "9 Blue Farm \n", "10 Borgo del Tiglio \n", "11 Patricia Green Cellars \n", "12 Patricia Green Cellars \n", "13 Vignobles Brumont \n", "14 Domaine Serene \n", "15 Bergström \n", "16 Hall \n", "17 Valduero \n", "18 Château Lagrézette \n", "19 Gary Farrell \n", "20 Heitz \n", "21 Muga \n", "22 Numanthia \n", "23 Center of Effort \n", "24 Comartin \n", "25 Kumeu River \n", "26 Bergström \n", "27 Ponzi \n", "28 Roar \n", "29 Saviah " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_concat = pd.concat([data_1, data_2]) # Объединяем два датафрейма\n", "df_concat # Отображаем объединенный датафрейм" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "Y36JPmmyhkzd", "outputId": "3c39bb6e-2353-4360-a4fa-c9dc0af22d51" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2varietywinery
0USThis tremendous 100% varietal wine hails from ...Martha's Vineyard96235.0CaliforniaNapa ValleyNapaCabernet SauvignonHeitz
1SpainRipe aromas of fig, blackberry and cassis are ...Carodorum Selección Especial Reserva96110.0Northern SpainToroNaNTinta de ToroBodega Carmen Rodríguez
2USMac Watson honors the memory of a wine once ma...Special Selected Late Harvest9690.0CaliforniaKnights ValleySonomaSauvignon BlancMacauley
3USThis spent 20 months in 30% new French oak, an...Reserve9665.0OregonWillamette ValleyWillamette ValleyPinot NoirPonzi
4FranceThis is the top wine from La Bégude, named aft...La Brûlade9566.0ProvenceBandolNaNProvence red blendDomaine de la Bégude
\n", "
" ], "text/plain": [ " country description \\\n", "0 US This tremendous 100% varietal wine hails from ... \n", "1 Spain Ripe aromas of fig, blackberry and cassis are ... \n", "2 US Mac Watson honors the memory of a wine once ma... \n", "3 US This spent 20 months in 30% new French oak, an... \n", "4 France This is the top wine from La Bégude, named aft... \n", "\n", " designation points price province \\\n", "0 Martha's Vineyard 96 235.0 California \n", "1 Carodorum Selección Especial Reserva 96 110.0 Northern Spain \n", "2 Special Selected Late Harvest 96 90.0 California \n", "3 Reserve 96 65.0 Oregon \n", "4 La Brûlade 95 66.0 Provence \n", "\n", " region_1 region_2 variety \\\n", "0 Napa Valley Napa Cabernet Sauvignon \n", "1 Toro NaN Tinta de Toro \n", "2 Knights Valley Sonoma Sauvignon Blanc \n", "3 Willamette Valley Willamette Valley Pinot Noir \n", "4 Bandol NaN Provence red blend \n", "\n", " winery \n", "0 Heitz \n", "1 Bodega Carmen Rodríguez \n", "2 Macauley \n", "3 Ponzi \n", "4 Domaine de la Bégude " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_1.head() # Выводим первые 5 строк data_1" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 111 }, "id": "2H9dAxpQhkze", "outputId": "06098996-d31b-4409-ddfa-773a3fa8bef2" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2varietywinery
0USKerasMartha's Vineyard96235.0CaliforniaNapa ValleyNapaCabernet SauvignonHeitz
1SpainRipe aromas of fig, blackberry and cassis are ...Carodorum Selección Especial Reserva96110.0Northern SpainToroNaNTinta de ToroBodega Carmen Rodríguez
\n", "
" ], "text/plain": [ " country description \\\n", "0 US Keras \n", "1 Spain Ripe aromas of fig, blackberry and cassis are ... \n", "\n", " designation points price province \\\n", "0 Martha's Vineyard 96 235.0 California \n", "1 Carodorum Selección Especial Reserva 96 110.0 Northern Spain \n", "\n", " region_1 region_2 variety winery \n", "0 Napa Valley Napa Cabernet Sauvignon Heitz \n", "1 Toro NaN Tinta de Toro Bodega Carmen Rodríguez " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_1.iloc[0,1] = 'Keras' # Изменяем значение в ячейке data_1\n", "data_1.head(2) # Выводим на экран 2 первые строки data_1" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 111 }, "id": "z_98pMQ9MHFm", "outputId": "671b4223-d59a-4230-c0d2-7b3e33dc1ff4" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2varietywinery
0USThis tremendous 100% varietal wine hails from ...Martha's Vineyard96235.0CaliforniaNapa ValleyNapaCabernet SauvignonHeitz
1SpainRipe aromas of fig, blackberry and cassis are ...Carodorum Selección Especial Reserva96110.0Northern SpainToroNaNTinta de ToroBodega Carmen Rodríguez
\n", "
" ], "text/plain": [ " country description \\\n", "0 US This tremendous 100% varietal wine hails from ... \n", "1 Spain Ripe aromas of fig, blackberry and cassis are ... \n", "\n", " designation points price province \\\n", "0 Martha's Vineyard 96 235.0 California \n", "1 Carodorum Selección Especial Reserva 96 110.0 Northern Spain \n", "\n", " region_1 region_2 variety winery \n", "0 Napa Valley Napa Cabernet Sauvignon Heitz \n", "1 Toro NaN Tinta de Toro Bodega Carmen Rodríguez " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_concat.head(2) # Выводим на экран 2 первые строки df_concat" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "0Chntu25WIF2" }, "outputs": [], "source": [ "## 2. Добавление столбцов и строк" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 173 }, "id": "niLdtd4xWMb0", "outputId": "48bda55e-a9ef-4026-fbd1-70f9021f2b84" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
01234
15678
29101112
313141516
\n", "
" ], "text/plain": [ " A B C D\n", "0 1 2 3 4\n", "1 5 6 7 8\n", "2 9 10 11 12\n", "3 13 14 15 16" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = ['A', 'B', 'C', 'D'] # Создаем список с названием столбцов\n", "values = [[1,2,3,4], [5,6,7,8],[9,10,11,12], [13,14,15,16]] # Создаем список со значениями\n", "df_for_add = pd.DataFrame(values, columns=columns) # Создаем датафрейм из сформированных значений\n", "df_for_add # Выводим на экран датафрейм df_for_add" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 173 }, "id": "NGK40s7PWhcK", "outputId": "2507bf8f-a895-4adf-e43c-a245664eac7d" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
0123417
1567818
2910111219
31314151620
\n", "
" ], "text/plain": [ " A B C D E\n", "0 1 2 3 4 17\n", "1 5 6 7 8 18\n", "2 9 10 11 12 19\n", "3 13 14 15 16 20" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_for_add['E'] = [17,18,19,20] # Добавляем столбец\n", "df_for_add # Выводим на экран датафрейм df_for_add" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 173 }, "id": "WVt6bsnC9l90", "outputId": "29cb192e-bb63-4128-b191-7ff683162b54" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
0123417
1567817
2910111217
31314151627
\n", "
" ], "text/plain": [ " A B C D E\n", "0 1 2 3 4 17\n", "1 5 6 7 8 17\n", "2 9 10 11 12 17\n", "3 13 14 15 16 27" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_for_add['E'] = [17,17,17,27] # Перезаписываем значения уже существующего столбца E\n", "df_for_add # Выводим на экран датафрейм df_for_add" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "T2UEW9MfW0pj", "outputId": "de7a0376-366f-4621-f8fc-5828e76ac008" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
0123417
1567817
2910111217
31314151627
412345
\n", "
" ], "text/plain": [ " A B C D E\n", "0 1 2 3 4 17\n", "1 5 6 7 8 17\n", "2 9 10 11 12 17\n", "3 13 14 15 16 27\n", "4 1 2 3 4 5" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_for_add.loc[4] = [1,2,3,4,5] # Добавляем строку \n", "df_for_add # Выводим на экран датафрейм df_for_add" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "ZFRI_F9L-CGs", "outputId": "57289a3c-a004-4503-f353-5a3bc2cefb19" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
0123417
1567817
2910111217
31314151627
412345
812345
\n", "
" ], "text/plain": [ " A B C D E\n", "0 1 2 3 4 17\n", "1 5 6 7 8 17\n", "2 9 10 11 12 17\n", "3 13 14 15 16 27\n", "4 1 2 3 4 5\n", "8 1 2 3 4 5" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_for_add.loc[8] = [1,2,3,4,5] # Добавляем строку с меткой индекса 8 (метки индекса не обязаны идти подряд)\n", "df_for_add # Выводим на экран датафрейм df_for_add" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "Af1_T2N8jAfF" }, "outputs": [], "source": [ "## 3. Groupby" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "3QFEHZdYjAew", "outputId": "c51e1577-23d8-4707-cf6c-42fc94d0fd86" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrydescriptiondesignationpointspriceprovinceregion_1region_2varietywinery
0USThis tremendous 100% varietal wine hails from ...Martha's Vineyard96235.0CaliforniaNapa ValleyNapaCabernet SauvignonHeitz
1SpainRipe aromas of fig, blackberry and cassis are ...Carodorum Selección Especial Reserva96110.0Northern SpainToroNaNTinta de ToroBodega Carmen Rodríguez
2USMac Watson honors the memory of a wine once ma...Special Selected Late Harvest9690.0CaliforniaKnights ValleySonomaSauvignon BlancMacauley
3USThis spent 20 months in 30% new French oak, an...Reserve9665.0OregonWillamette ValleyWillamette ValleyPinot NoirPonzi
4FranceThis is the top wine from La Bégude, named aft...La Brûlade9566.0ProvenceBandolNaNProvence red blendDomaine de la Bégude
\n", "
" ], "text/plain": [ " country description \\\n", "0 US This tremendous 100% varietal wine hails from ... \n", "1 Spain Ripe aromas of fig, blackberry and cassis are ... \n", "2 US Mac Watson honors the memory of a wine once ma... \n", "3 US This spent 20 months in 30% new French oak, an... \n", "4 France This is the top wine from La Bégude, named aft... \n", "\n", " designation points price province \\\n", "0 Martha's Vineyard 96 235.0 California \n", "1 Carodorum Selección Especial Reserva 96 110.0 Northern Spain \n", "2 Special Selected Late Harvest 96 90.0 California \n", "3 Reserve 96 65.0 Oregon \n", "4 La Brûlade 95 66.0 Provence \n", "\n", " region_1 region_2 variety \\\n", "0 Napa Valley Napa Cabernet Sauvignon \n", "1 Toro NaN Tinta de Toro \n", "2 Knights Valley Sonoma Sauvignon Blanc \n", "3 Willamette Valley Willamette Valley Pinot Noir \n", "4 Bandol NaN Provence red blend \n", "\n", " winery \n", "0 Heitz \n", "1 Bodega Carmen Rodríguez \n", "2 Macauley \n", "3 Ponzi \n", "4 Domaine de la Bégude " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head() # Выведем первые 5 строк data" ] }, { "cell_type": "markdown", "metadata": { "id": "ItPspZfLjAfL" }, "source": [ "Groupby - это очень важный и широко используемый метод. Позволяет делать группировку данных по какому-либо столбцу. \n", "\n", "Пример:\n", "* посчитать средние баллы и цену в разрезе по странам и провинциям" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "UM89p_pcjAfO", "outputId": "dee3be1a-1cf3-44cb-8af2-4390811c9cce" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/wx/b8w5bvpx5hjdgyvqlfp38x940000gn/T/ipykernel_15726/32687945.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. 
Either specify numeric_only or select only columns which should be valid for the function.\n", " df_mean = data.groupby(['country']).mean() # Группируем данные по странам и считаем среднее значение (считается для числовых столбцов)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pointsprice
country
Albania88.00000020.000000
Argentina85.99609320.794881
Australia87.89247531.258480
Austria89.27674231.192106
Bosnia and Herzegovina84.75000012.750000
\n", "
" ], "text/plain": [ " points price\n", "country \n", "Albania 88.000000 20.000000\n", "Argentina 85.996093 20.794881\n", "Australia 87.892475 31.258480\n", "Austria 89.276742 31.192106\n", "Bosnia and Herzegovina 84.750000 12.750000" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_mean = data.groupby(['country']).mean() # Группируем данные по странам и считаем среднее значение (считается для числовых столбцов)\n", "df_mean.head() # Выводим первые 5 строк df_mean" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 269 }, "id": "xLhe1562suR1", "outputId": "26e6853b-cb62-464f-9f4e-48f912c84429" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/wx/b8w5bvpx5hjdgyvqlfp38x940000gn/T/ipykernel_15726/30053901.py:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n", " df_max = data.groupby('country')['points','price'].max() # Группируем данные по странам и считаем максимальное значение\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pointsprice
country
Albania8820.0
Argentina97250.0
Australia100850.0
Austria981100.0
Bosnia and Herzegovina8813.0
\n", "
" ], "text/plain": [ " points price\n", "country \n", "Albania 88 20.0\n", "Argentina 97 250.0\n", "Australia 100 850.0\n", "Austria 98 1100.0\n", "Bosnia and Herzegovina 88 13.0" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_max = data.groupby('country')['points','price'].max() # Группируем данные по странам и считаем максимальное значение\n", "df_max.head() # # Выводим первые 5 строк df_max" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 390 }, "id": "6RifUrsujAfY", "outputId": "ca2d153c-a2bc-4491-b6cd-c5ac1fa1f588" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pointsprice
country
England92.88888947.500000
Austria89.27674231.192106
France88.92587045.619885
Germany88.62642739.011078
Italy88.41366437.547913
Canada88.23979634.628866
Slovenia88.23404328.061728
Morocco88.16666718.833333
Turkey88.09615425.800000
Portugal88.05768526.332615
\n", "
" ], "text/plain": [ " points price\n", "country \n", "England 92.888889 47.500000\n", "Austria 89.276742 31.192106\n", "France 88.925870 45.619885\n", "Germany 88.626427 39.011078\n", "Italy 88.413664 37.547913\n", "Canada 88.239796 34.628866\n", "Slovenia 88.234043 28.061728\n", "Morocco 88.166667 18.833333\n", "Turkey 88.096154 25.800000\n", "Portugal 88.057685 26.332615" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# сортировка по столбцу points\n", "sorted_data = df_mean.sort_values('points', ascending=False) # Сортируем датафрейм со средними значениями по убыванию points\n", "sorted_data.head(10) # Выводим первые 10 строк sorted_data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 390 }, "id": "_7_n-qrOjAfq", "outputId": "ce1da65e-7c1b-4ad1-9634-96b814600ea1" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/wx/b8w5bvpx5hjdgyvqlfp38x940000gn/T/ipykernel_15726/1962610003.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n", " data.groupby(['country', 'region_1', 'region_2']).mean().head(10) # Выполняем группировку по трем столбцам (считаем среднее значение) и выводим первые 10 строк\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pointsprice
countryregion_1region_2
USAdelaida DistrictCentral Coast90.12000048.560000
Alexander ValleySonoma87.77348635.901582
Alta MesaCentral Valley87.18181815.636364
Amador CountySierra Foothills86.67679024.341304
Amador-NapaCalifornia Other84.00000012.000000
Ancient LakesColumbia Valley86.55555618.777778
Anderson ValleyMendocino/Lake Counties89.61747040.626506
Anderson Valley-Sonoma County-Cole RanchNorth Coast87.00000018.000000
Antelope Valley of the California High DesertSouth Coast89.00000021.000000
Applegate ValleySouthern Oregon87.39240531.253165
\n", "
" ], "text/plain": [ " points \\\n", "country region_1 region_2 \n", "US Adelaida District Central Coast 90.120000 \n", " Alexander Valley Sonoma 87.773486 \n", " Alta Mesa Central Valley 87.181818 \n", " Amador County Sierra Foothills 86.676790 \n", " Amador-Napa California Other 84.000000 \n", " Ancient Lakes Columbia Valley 86.555556 \n", " Anderson Valley Mendocino/Lake Counties 89.617470 \n", " Anderson Valley-Sonoma County-Cole Ranch North Coast 87.000000 \n", " Antelope Valley of the California High Desert South Coast 89.000000 \n", " Applegate Valley Southern Oregon 87.392405 \n", "\n", " price \n", "country region_1 region_2 \n", "US Adelaida District Central Coast 48.560000 \n", " Alexander Valley Sonoma 35.901582 \n", " Alta Mesa Central Valley 15.636364 \n", " Amador County Sierra Foothills 24.341304 \n", " Amador-Napa California Other 12.000000 \n", " Ancient Lakes Columbia Valley 18.777778 \n", " Anderson Valley Mendocino/Lake Counties 40.626506 \n", " Anderson Valley-Sonoma County-Cole Ranch North Coast 18.000000 \n", " Antelope Valley of the California High Desert South Coast 21.000000 \n", " Applegate Valley Southern Oregon 31.253165 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# группировка по трем столбцам\n", "data.groupby(['country', 'region_1', 'region_2']).mean().head(10) # Выполняем группировку по трем столбцам (считаем среднее значение) и выводим первые 10 строк" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "id": "X6KDR8HPjAf3" }, "outputs": [ { "ename": "SyntaxError", "evalue": "invalid syntax (1262463914.py, line 3)", "output_type": "error", "traceback": [ "\u001b[0;36m Cell \u001b[0;32mIn[25], line 3\u001b[0;36m\u001b[0m\n\u001b[0;31m Join - позволяет объединять несколько таблиц в единую по ключам (одинаковым столбцам).\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" ] } ], "source": [ "## 4. 
Join\n", "\n", "Join - позволяет объединять несколько таблиц в единую по ключам (одинаковым столбцам). \n", "\n", "Join бывает разный:\n", "* Left\n", "* Right\n", "* Inner\n", "* FullJoin" ] }, { "cell_type": "markdown", "metadata": { "id": "fI0-DwH7jAf-" }, "source": [ "Посмотрим на примерах, как это работает.\n", "\n", "Допустим, у нас есть два датафрейма:\n", "* первый содержит информацию о студентах и ID курсах, которые они посещают,\n", "* второй - описание курсов." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "8LQUr6fBjAgB", "outputId": "a39750b8-76dc-4be6-da45-239a8690d30a" }, "outputs": [], "source": [ "# создаем первый датафрейм\n", "data_1 = np.array([['100500', '200600', '100500', '300700', '200600', '500900'],\n", " ['Анастасия', 'Екатерина', 'Светлана', 'Максим', 'Станислав', 'Данила'], \n", " ['Андреева', 'Петрова', 'Иванова', 'Егоров', 'Лесницкий', 'Кудряшев']])\n", "\n", "names = pd.DataFrame(data_1.T, columns = ['subject_id', 'first_name', 'last_name']) # Создаем датафрейм из сформированных данных\n", "names # Выводим датафрейм names на экран" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 173 }, "id": "gIFHRi2QjAgS", "outputId": "5e70e778-0d82-4903-924e-51c52d56dce8" }, "outputs": [], "source": [ "# создаем второй датафрейм\n", "data_2 = np.array([['100500', '200600', '300700', '400700'],\n", " ['Математика', 'Программирование', 'Анализ данных', 'Мат. 
анализ'], \n", " ['продвинутые методы математической статистики', 'расширенный курс по с++',\\\n", " 'краткий курс для чайников', 'многомерный анализ']])\n", "subjects = pd.DataFrame(data_2.T, columns = ['subject_id','course_name', 'description']) # Создаем датафрейм из сформированных данных\n", "subjects # Выводим датафрейм subjects на экран" ] }, { "cell_type": "markdown", "metadata": { "id": "ME_fbI0FjAge" }, "source": [ "#### Делаем Join:\n", "\n", "Left Join" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "BgXi9IbYjAgg", "outputId": "8718fb50-57cc-46a6-d258-cafc2806e61f" }, "outputs": [], "source": [ "names.merge(subjects, on='subject_id', how='left') # Объединяем два датафрейма по столбцу subject_id (join-left)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "oG6gcJB0_n8U", "outputId": "9892e965-72d8-4c2f-aea9-2549271b3b8f" }, "outputs": [], "source": [ "pd.merge(names, subjects, on='subject_id', how='right') # Объединяем два датафрейма по столбцу subject_id (join-right)" ] }, { "cell_type": "markdown", "metadata": { "id": "H4M4MQfsjAgm" }, "source": [ "Inner Join" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "Fr9FeDknjAgq", "outputId": "eae99595-f9f3-4106-b81b-c6171911d70d" }, "outputs": [], "source": [ "names.merge(subjects, on='subject_id', how='inner') # Объединяем два датафрейма по столбцу subject_id (join-inner)" ] }, { "cell_type": "markdown", "metadata": { "id": "vi9i7xeFjAhH" }, "source": [ "Outer Join" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 266 }, "id": "2oCn_zOrjAhJ", "outputId": "b8c898ff-c6b8-41e7-b6d6-fb8dce7210e5" }, "outputs": [], "source": [ "df = names.merge(subjects, 
on='subject_id', how='outer') # Объединяем два датафрейма по столбцу subject_id (join-outer)\n", "df" ] }, { "cell_type": "markdown", "metadata": { "id": "8YCox3IxjAhW" }, "source": [ "Join при разных названиях столбцов с ключом:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "oxIBiXBLjAhY", "outputId": "78fedded-83a6-47e6-81cf-3e3866f11685" }, "outputs": [], "source": [ "names_2 = names.copy() # Создаем копию датафрейма names\n", "cols = names.columns.tolist() # Получаем список названий столбцов\n", "cols[0] = 'ID' # Меняем название нулевого столбца на 'ID'\n", "names_2.columns = cols # Присваиваем датафрейму names_2 новые названия столбцов\n", "names_2 # Выводим на экран датафрейм names_2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "T68tbKU0jAhj", "outputId": "0bd367c2-ad2f-4317-d815-7162fbddf35a" }, "outputs": [], "source": [ "# делаем join\n", "pd.merge(names_2, subjects, left_on='ID', right_on='subject_id', how='inner') # Объединяем два датафрейма по столбцам ID и subject_id (join-inner)" ] }, { "cell_type": "markdown", "metadata": { "id": "Mg6fQ0UwjAhn" }, "source": [ "Join по двум ключам:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "s_BZm0fbjAhq" }, "outputs": [], "source": [ "cols_a = ['A', 'B', 'value_1'] # Создаем список названий колонок cols_a\n", "cols_b = ['A', 'B', 'value_2'] # Создаем список названий колонок cols_b\n", "data_a = [[1, 1, 23], [1, 2, 34], [2, 1, 2342], [2, 2, 333]] # Создаем список значение data_a\n", "data_b = [[1, 1, 0.1], [1, 2, 0.2], [2, 2, 0.13], [2, 3, 0.33]] # Создаем список значение data_b\n", "df_a = pd.DataFrame(data_a, columns=cols_a) # Формируем датафрейм df_a из созданных ранее данных\n", "df_b = pd.DataFrame(data_b, columns=cols_b) # Формируем датафрейм df_b из созданных ранее данных" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 173 }, "id": "I1bHptln_BWL", "outputId": "dc229fc9-30dc-4cbf-910d-2f78b983d51f" }, "outputs": [], "source": [ "df_a # Выводим на экран датафрейм df_a" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 173 }, "id": "UKsQpT1vjAh3", "outputId": "8200e7dd-b5bb-4e7d-e1b5-5d4c12d404f1" }, "outputs": [], "source": [ "df_b # Выводим на экран датафрейм df_b" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "5MiuPPQZjAiE", "outputId": "7a5164c9-b899-412a-ff05-1ab874d8dfe0" }, "outputs": [], "source": [ "df_a.merge(df_b, on=['A', 'B'], how='outer') # Объединяем два датафрейма по столбцам A и B (join-outer)" ] }, { "cell_type": "markdown", "metadata": { "id": "6cRasbOXjAiS" }, "source": [ "А join по одному ключу в данном случае выглядел бы так:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 297 }, "id": "GrTN-pM9jAiT", "outputId": "03c0f18d-fdc7-48db-c6ec-d1d50b11f7f8" }, "outputs": [], "source": [ "df_a.merge(df_b, on='A', how='inner') # Объединяем два датафрейма по столбцу A (join-inner)" ] }, { "cell_type": "markdown", "metadata": { "id": "9V7ArTkzjAia" }, "source": [ "Или так:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "2S02OpysjAic", "outputId": "74be38cf-e447-45bc-b926-5886a3753860" }, "outputs": [], "source": [ "df_a.merge(df_b, on='B', how='inner') # Объединяем два датафрейма по столбцу B (join-inner)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "reSNH-05jAil" }, "outputs": [], "source": [ "Или вот так по разным ключам:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": 
{ "base_uri": "https://localhost:8080/", "height": 235 }, "id": "vwici4RdjAim", "outputId": "72122abd-ea12-4fc3-bf40-ce65ed8b376b" }, "outputs": [], "source": [ "pd.merge(df_a, df_b, left_on='A', right_on='B', how='inner') # Объединяем два датафрейма по столбцам A и B (join-inner)" ] }, { "cell_type": "markdown", "metadata": { "id": "6tg5a_WOjAit" }, "source": [ "Заполнение пропусков в одном из столбцов:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 266 }, "id": "RsK9n3vVAgK-", "outputId": "1b5bd072-4650-41e6-cf04-e270d5ac3cc6" }, "outputs": [], "source": [ "df # Выведем на экран датафрейм df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 266 }, "id": "eolkgC-0jAi2", "outputId": "9bdae116-fbd9-4181-9042-674be4fe0a9b" }, "outputs": [], "source": [ "df.first_name = df.first_name.fillna('Не заполнено') # Заменяем пропуски в столбце first_name на значение 'Не заполнено'\n", "df # Выводим на экран датафрейм df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 266 }, "id": "pWhjhgeBjAi9", "outputId": "b9d16d37-9b18-4db5-d27f-a32e176743d3" }, "outputs": [], "source": [ "df.last_name = df.last_name.fillna('Пропущено') # Заменяем пропуски в столбце last_name на значение 'Пропущено'\n", "df # Выводим на экран датафрейм df" ] }, { "cell_type": "markdown", "metadata": { "id": "XCtORa5gjAjK" }, "source": [ "Во всей табличке:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 266 }, "id": "NC3cBEusjAjL", "outputId": "241287e4-9036-43bd-f633-89ad814504a9" }, "outputs": [], "source": [ "df = df.fillna('Неизвестно') # Заменяем пропуски во всем датафрейме на значение 'Неизвестно'\n", "df # Выводим на экран датафрейм df" ] }, { "cell_type": "markdown", "metadata": { "id": 
"Zau_QVx8jAjV" }, "source": [ "Разными значениями в разных столбцах:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 266 }, "id": "QFNCSrSGjAjV", "outputId": "fb3975c1-e389-47a1-b591-6fc1f0cdaf08" }, "outputs": [], "source": [ "df = names.merge(subjects, on='subject_id', how='outer') # Объединяем два датафрейма по столбцу subject_id (join-outer)\n", "df # Выводим на экран ДатаФрейм df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "-Vc-1RmSjAje", "outputId": "0c800ab1-2ae5-4768-8fda-f1a288221c33" }, "outputs": [], "source": [ "val = ['Не заполнено', 'Пропущено', 'Неизвестный курс', 'Неизвестное описание'] # Создаем список со значениями, которые будут использоваться вместо пропусков\n", "cols = df.columns.tolist() # Получаем названия столбцов датафрейма df\n", "cols.pop(0) # Удаляем 0 столбец (соответствует subject_id)\n", "print(cols) # Выводим на экран список cols" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 85 }, "id": "sS1LJWsrjAjm", "outputId": "a01124db-f1f9-4d25-8f28-aabbb3617424" }, "outputs": [], "source": [ "# создаем словарь с меппингом названий столбцов к значениям, какими нужно заполнить в них пропуски\n", "dict_to_fill = dict(zip(cols, val)) # Создаем словарь, в котором в качестве ключей будут значения списка cols, а в качестве соответствующих значений - значения списка val\n", "dict_to_fill # Выводим на экран словарь dict_to_fill" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 266 }, "id": "9JythrBSjAjp", "outputId": "32c04fc8-b218-40b4-bfd4-ea79cbe893b8" }, "outputs": [], "source": [ "df = df.fillna(dict_to_fill) # Заменяем пропуски в соответствии со словарем dict_to_fill\n", "df # Выводим на экран датафрейм df" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "id": "5_aJnqtHjAjs" }, "outputs": [], "source": [ "## 5. DateTime index\n", "\n", "DateTime Index - это особый тип индекса в pandas, предназначенный для работы с временными рядами. \n", "Давайте познакомимся с ним поближе. Для этой цели мы подгрузим известный стандартный датасет \"Occupancy Detection Data Set\", \n", "он содержит следующие данные:\n", "* дата и время замера\n", "* температура в градусах по Цельсию\n", "* относительная влажность в % \n", "* освещенность в Люксах\n", "* CO2 в ppm \n", "* коэффициент влажности\n", "* заполненность (1 - заполнено, 0 - не заполнено)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "xICkBqa4jAjt", "outputId": "37737fb0-d0e6-4bff-89b4-8c83c179886c" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateTemperatureHumidityLightCO2HumidityRatioOccupancy
12015-02-04 17:51:0023.1827.2720426.0721.250.0047931
22015-02-04 17:51:5923.1527.2675429.5714.000.0047831
32015-02-04 17:53:0023.1527.2450426.0713.500.0047791
42015-02-04 17:54:0023.1527.2000426.0708.250.0047721
52015-02-04 17:55:0023.1027.2000426.0704.500.0047571
\n", "
" ], "text/plain": [ " date Temperature Humidity Light CO2 HumidityRatio \\\n", "1 2015-02-04 17:51:00 23.18 27.2720 426.0 721.25 0.004793 \n", "2 2015-02-04 17:51:59 23.15 27.2675 429.5 714.00 0.004783 \n", "3 2015-02-04 17:53:00 23.15 27.2450 426.0 713.50 0.004779 \n", "4 2015-02-04 17:54:00 23.15 27.2000 426.0 708.25 0.004772 \n", "5 2015-02-04 17:55:00 23.10 27.2000 426.0 704.50 0.004757 \n", "\n", " Occupancy \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 \n", "5 1 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataTraining = pd.read_csv('datatraining.txt') # Создаем датафрейм из файла datatraining.txt\n", "dataTraining.head() # Отображаем первые 5 строк датафрейма dataTraining" ] }, { "cell_type": "markdown", "metadata": { "id": "eQHTSHF-jAjw" }, "source": [ "Давайте превратим колонку с датой в datetime index:" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "zcJ96Gz4jAjx", "outputId": "6929acd1-b62d-401b-c95f-994f43bc9af9" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateTemperatureHumidityLightCO2HumidityRatioOccupancy
date
2015-02-04 17:51:002015-02-04 17:51:0023.1827.2720426.0721.250.0047931
2015-02-04 17:51:592015-02-04 17:51:5923.1527.2675429.5714.000.0047831
2015-02-04 17:53:002015-02-04 17:53:0023.1527.2450426.0713.500.0047791
2015-02-04 17:54:002015-02-04 17:54:0023.1527.2000426.0708.250.0047721
2015-02-04 17:55:002015-02-04 17:55:0023.1027.2000426.0704.500.0047571
\n", "
" ], "text/plain": [ " date Temperature Humidity Light \\\n", "date \n", "2015-02-04 17:51:00 2015-02-04 17:51:00 23.18 27.2720 426.0 \n", "2015-02-04 17:51:59 2015-02-04 17:51:59 23.15 27.2675 429.5 \n", "2015-02-04 17:53:00 2015-02-04 17:53:00 23.15 27.2450 426.0 \n", "2015-02-04 17:54:00 2015-02-04 17:54:00 23.15 27.2000 426.0 \n", "2015-02-04 17:55:00 2015-02-04 17:55:00 23.10 27.2000 426.0 \n", "\n", " CO2 HumidityRatio Occupancy \n", "date \n", "2015-02-04 17:51:00 721.25 0.004793 1 \n", "2015-02-04 17:51:59 714.00 0.004783 1 \n", "2015-02-04 17:53:00 713.50 0.004779 1 \n", "2015-02-04 17:54:00 708.25 0.004772 1 \n", "2015-02-04 17:55:00 704.50 0.004757 1 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataTraining.index = pd.to_datetime(dataTraining.date) # Превращаем колонку date в индекс\n", "dataTraining.head() # Отображаем первые 5 строк датафрейма dataTraining" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "1Yb4I5iSEyag", "outputId": "c00a1497-f528-4ab2-b267-5b7d5d315491" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TemperatureHumidityLightCO2HumidityRatioOccupancy
date
2015-02-04 17:51:0023.1827.2720426.0721.250.0047931
2015-02-04 17:51:5923.1527.2675429.5714.000.0047831
2015-02-04 17:53:0023.1527.2450426.0713.500.0047791
2015-02-04 17:54:0023.1527.2000426.0708.250.0047721
2015-02-04 17:55:0023.1027.2000426.0704.500.0047571
\n", "
" ], "text/plain": [ " Temperature Humidity Light CO2 HumidityRatio \\\n", "date \n", "2015-02-04 17:51:00 23.18 27.2720 426.0 721.25 0.004793 \n", "2015-02-04 17:51:59 23.15 27.2675 429.5 714.00 0.004783 \n", "2015-02-04 17:53:00 23.15 27.2450 426.0 713.50 0.004779 \n", "2015-02-04 17:54:00 23.15 27.2000 426.0 708.25 0.004772 \n", "2015-02-04 17:55:00 23.10 27.2000 426.0 704.50 0.004757 \n", "\n", " Occupancy \n", "date \n", "2015-02-04 17:51:00 1 \n", "2015-02-04 17:51:59 1 \n", "2015-02-04 17:53:00 1 \n", "2015-02-04 17:54:00 1 \n", "2015-02-04 17:55:00 1 " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataTraining.drop('date', axis=1, inplace=True) # Удаляем колонку date из датафрейма\n", "dataTraining.head() # Отображаем первые 5 строк датафрейма dataTraining" ] }, { "cell_type": "markdown", "metadata": { "id": "9xZ3hSlDjAj7" }, "source": [ "Давайте посмотрим, что можно делать с данными, имеющими временной индекс." ] }, { "cell_type": "markdown", "metadata": { "id": "3_YBy_F0jAj8" }, "source": [ "### 1. 
Подвыборки" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1aYyxuXUjAj-", "outputId": "f6584641-0f32-461a-f20b-3e7372f340bc" }, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2015-02-04 17:51:00', '2015-02-04 17:51:59',\n", " '2015-02-04 17:53:00', '2015-02-04 17:54:00',\n", " '2015-02-04 17:55:00', '2015-02-04 17:55:59',\n", " '2015-02-04 17:57:00', '2015-02-04 17:57:59',\n", " '2015-02-04 17:58:59', '2015-02-04 18:00:00',\n", " ...\n", " '2015-02-10 09:23:59', '2015-02-10 09:24:59',\n", " '2015-02-10 09:26:00', '2015-02-10 09:27:00',\n", " '2015-02-10 09:28:00', '2015-02-10 09:29:00',\n", " '2015-02-10 09:29:59', '2015-02-10 09:30:59',\n", " '2015-02-10 09:32:00', '2015-02-10 09:33:00'],\n", " dtype='datetime64[ns]', name='date', length=8143, freq=None)" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataTraining.index # Выведем на экран колонку индексов" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 450 }, "id": "aFOWuFuojAkC", "outputId": "b611086c-2f28-4d4f-f0f4-d177f971fa71" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TemperatureHumidityLightCO2HumidityRatioOccupancy
date
2015-02-04 17:51:0023.1827.2720426.0721.250.0047931
2015-02-04 17:51:5923.1527.2675429.5714.000.0047831
2015-02-04 17:53:0023.1527.2450426.0713.500.0047791
2015-02-04 17:54:0023.1527.2000426.0708.250.0047721
2015-02-04 17:55:0023.1027.2000426.0704.500.0047571
.....................
2015-02-06 23:55:5920.0018.74500.0435.000.0027030
2015-02-06 23:57:0020.0018.70000.0441.000.0026960
2015-02-06 23:57:5920.0018.70000.0441.000.0026960
2015-02-06 23:58:5920.0018.70000.0440.000.0026960
2015-02-07 00:00:0020.0018.70000.0438.000.0026960
\n", "

3250 rows × 6 columns

\n", "
" ], "text/plain": [ " Temperature Humidity Light CO2 HumidityRatio \\\n", "date \n", "2015-02-04 17:51:00 23.18 27.2720 426.0 721.25 0.004793 \n", "2015-02-04 17:51:59 23.15 27.2675 429.5 714.00 0.004783 \n", "2015-02-04 17:53:00 23.15 27.2450 426.0 713.50 0.004779 \n", "2015-02-04 17:54:00 23.15 27.2000 426.0 708.25 0.004772 \n", "2015-02-04 17:55:00 23.10 27.2000 426.0 704.50 0.004757 \n", "... ... ... ... ... ... \n", "2015-02-06 23:55:59 20.00 18.7450 0.0 435.00 0.002703 \n", "2015-02-06 23:57:00 20.00 18.7000 0.0 441.00 0.002696 \n", "2015-02-06 23:57:59 20.00 18.7000 0.0 441.00 0.002696 \n", "2015-02-06 23:58:59 20.00 18.7000 0.0 440.00 0.002696 \n", "2015-02-07 00:00:00 20.00 18.7000 0.0 438.00 0.002696 \n", "\n", " Occupancy \n", "date \n", "2015-02-04 17:51:00 1 \n", "2015-02-04 17:51:59 1 \n", "2015-02-04 17:53:00 1 \n", "2015-02-04 17:54:00 1 \n", "2015-02-04 17:55:00 1 \n", "... ... \n", "2015-02-06 23:55:59 0 \n", "2015-02-06 23:57:00 0 \n", "2015-02-06 23:57:59 0 \n", "2015-02-06 23:58:59 0 \n", "2015-02-07 00:00:00 0 \n", "\n", "[3250 rows x 6 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "period = dataTraining[(dataTraining.index>'2015-02-04') & (dataTraining.index<='2015-02-07')] # Формируем новый датафрейм period, в который запишем все данные с 4 по 7 февраля 2015 года\n", "period # Выведем на экран датафрейм period" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 450 }, "id": "rcz2NirgjAkI", "outputId": "5d4d6f64-5ac9-442e-cbc0-27ae3e6d67c6" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TemperatureHumidityLightCO2HumidityRatioOccupancy
date
2015-02-04 17:51:0023.1827.2720426.0721.2500000.0047931
2015-02-04 17:51:5923.1527.2675429.5714.0000000.0047831
2015-02-04 17:53:0023.1527.2450426.0713.5000000.0047791
2015-02-04 17:54:0023.1527.2000426.0708.2500000.0047721
2015-02-04 17:55:0023.1027.2000426.0704.5000000.0047571
.....................
2015-02-10 09:29:0021.0536.0975433.0787.2500000.0055791
2015-02-10 09:29:5921.0535.9950433.0789.5000000.0055631
2015-02-10 09:30:5921.1036.0950433.0798.5000000.0055961
2015-02-10 09:32:0021.1036.2600433.0820.3333330.0056211
2015-02-10 09:33:0021.1036.2000447.0821.0000000.0056121
\n", "

8143 rows × 6 columns

\n", "
" ], "text/plain": [ " Temperature Humidity Light CO2 HumidityRatio \\\n", "date \n", "2015-02-04 17:51:00 23.18 27.2720 426.0 721.250000 0.004793 \n", "2015-02-04 17:51:59 23.15 27.2675 429.5 714.000000 0.004783 \n", "2015-02-04 17:53:00 23.15 27.2450 426.0 713.500000 0.004779 \n", "2015-02-04 17:54:00 23.15 27.2000 426.0 708.250000 0.004772 \n", "2015-02-04 17:55:00 23.10 27.2000 426.0 704.500000 0.004757 \n", "... ... ... ... ... ... \n", "2015-02-10 09:29:00 21.05 36.0975 433.0 787.250000 0.005579 \n", "2015-02-10 09:29:59 21.05 35.9950 433.0 789.500000 0.005563 \n", "2015-02-10 09:30:59 21.10 36.0950 433.0 798.500000 0.005596 \n", "2015-02-10 09:32:00 21.10 36.2600 433.0 820.333333 0.005621 \n", "2015-02-10 09:33:00 21.10 36.2000 447.0 821.000000 0.005612 \n", "\n", " Occupancy \n", "date \n", "2015-02-04 17:51:00 1 \n", "2015-02-04 17:51:59 1 \n", "2015-02-04 17:53:00 1 \n", "2015-02-04 17:54:00 1 \n", "2015-02-04 17:55:00 1 \n", "... ... \n", "2015-02-10 09:29:00 1 \n", "2015-02-10 09:29:59 1 \n", "2015-02-10 09:30:59 1 \n", "2015-02-10 09:32:00 1 \n", "2015-02-10 09:33:00 1 \n", "\n", "[8143 rows x 6 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "month_set = dataTraining[dataTraining.index.month == 2] # Формируем новый датафрейм month_set, в который запишем все данные по февралю любого года\n", "month_set # Выведем на экран датафрейм month_set" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 450 }, "id": "QjMdNFlHjAkM", "outputId": "6bbb6eee-1c3e-4309-ba55-77fa0ef0cb12" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TemperatureHumidityLightCO2HumidityRatioOccupancy
date
2015-02-07 13:00:0023.1017.060000268.000000445.2500000.0029750
2015-02-07 13:01:0023.1017.033333268.250000447.5000000.0029700
2015-02-07 13:01:5923.1017.066667266.000000446.3333330.0029760
2015-02-07 13:02:5923.1017.000000270.250000453.5000000.0029650
2015-02-07 13:04:0023.1016.972500268.500000449.2500000.0029600
.....................
2015-02-07 13:56:0022.7017.050000144.750000442.5000000.0029020
2015-02-07 13:57:0022.7017.100000115.000000443.7500000.0029100
2015-02-07 13:58:0022.6517.100000131.000000445.0000000.0029020
2015-02-07 13:59:0022.6017.100000140.000000443.2500000.0028930
2015-02-07 13:59:5922.6017.100000167.666667444.0000000.0028930
\n", "

61 rows × 6 columns

\n", "
" ], "text/plain": [ " Temperature Humidity Light CO2 \\\n", "date \n", "2015-02-07 13:00:00 23.10 17.060000 268.000000 445.250000 \n", "2015-02-07 13:01:00 23.10 17.033333 268.250000 447.500000 \n", "2015-02-07 13:01:59 23.10 17.066667 266.000000 446.333333 \n", "2015-02-07 13:02:59 23.10 17.000000 270.250000 453.500000 \n", "2015-02-07 13:04:00 23.10 16.972500 268.500000 449.250000 \n", "... ... ... ... ... \n", "2015-02-07 13:56:00 22.70 17.050000 144.750000 442.500000 \n", "2015-02-07 13:57:00 22.70 17.100000 115.000000 443.750000 \n", "2015-02-07 13:58:00 22.65 17.100000 131.000000 445.000000 \n", "2015-02-07 13:59:00 22.60 17.100000 140.000000 443.250000 \n", "2015-02-07 13:59:59 22.60 17.100000 167.666667 444.000000 \n", "\n", " HumidityRatio Occupancy \n", "date \n", "2015-02-07 13:00:00 0.002975 0 \n", "2015-02-07 13:01:00 0.002970 0 \n", "2015-02-07 13:01:59 0.002976 0 \n", "2015-02-07 13:02:59 0.002965 0 \n", "2015-02-07 13:04:00 0.002960 0 \n", "... ... ... \n", "2015-02-07 13:56:00 0.002902 0 \n", "2015-02-07 13:57:00 0.002910 0 \n", "2015-02-07 13:58:00 0.002902 0 \n", "2015-02-07 13:59:00 0.002893 0 \n", "2015-02-07 13:59:59 0.002893 0 \n", "\n", "[61 rows x 6 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "day_set = dataTraining[(dataTraining.index.weekday == 5) & (dataTraining.index.hour == 13)] # Формируем новый датафрейм day_set, в который запишем данные за все субботы в промежуток с 13-00 до 13-59\n", "day_set # Выведем на экран датафрейм day_set\n", "# Значения индексов index.weekday может принимать значения от 0 до 6, где 0 - понедельник, 1 - вторник и т.д.\n", "# Значения индексов index.day может принимать значения от 1 до 31, в соответствии с числом месяца." 
] }, { "cell_type": "code", "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 450 }, "id": "TBnaueJ_jAkQ", "outputId": "dcd979d7-c575-4b42-c172-73b294093857" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/wx/b8w5bvpx5hjdgyvqlfp38x940000gn/T/ipykernel_15726/247725088.py:1: FutureWarning: Indexing a DataFrame with a datetimelike index using a single string to slice the rows, like `frame[string]`, is deprecated and will be removed in a future version. Use `frame.loc[string]` instead.\n", " dataTraining['2015-02-06'] # Отобразим данные за 6 февраля 2015 года\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TemperatureHumidityLightCO2HumidityRatioOccupancy
date
2015-02-06 00:00:0020.2021.2900.0438.00.0031100
2015-02-06 00:01:0020.2021.2000.0439.00.0030970
2015-02-06 00:02:0020.2021.2900.0441.50.0031100
2015-02-06 00:03:0020.2021.2900.0444.00.0031100
2015-02-06 00:04:0020.2021.2900.0446.50.0031100
.....................
2015-02-06 23:55:0019.8918.7900.0441.50.0026910
2015-02-06 23:55:5920.0018.7450.0435.00.0027030
2015-02-06 23:57:0020.0018.7000.0441.00.0026960
2015-02-06 23:57:5920.0018.7000.0441.00.0026960
2015-02-06 23:58:5920.0018.7000.0440.00.0026960
\n", "

1440 rows × 6 columns

\n", "
" ], "text/plain": [ " Temperature Humidity Light CO2 HumidityRatio \\\n", "date \n", "2015-02-06 00:00:00 20.20 21.290 0.0 438.0 0.003110 \n", "2015-02-06 00:01:00 20.20 21.200 0.0 439.0 0.003097 \n", "2015-02-06 00:02:00 20.20 21.290 0.0 441.5 0.003110 \n", "2015-02-06 00:03:00 20.20 21.290 0.0 444.0 0.003110 \n", "2015-02-06 00:04:00 20.20 21.290 0.0 446.5 0.003110 \n", "... ... ... ... ... ... \n", "2015-02-06 23:55:00 19.89 18.790 0.0 441.5 0.002691 \n", "2015-02-06 23:55:59 20.00 18.745 0.0 435.0 0.002703 \n", "2015-02-06 23:57:00 20.00 18.700 0.0 441.0 0.002696 \n", "2015-02-06 23:57:59 20.00 18.700 0.0 441.0 0.002696 \n", "2015-02-06 23:58:59 20.00 18.700 0.0 440.0 0.002696 \n", "\n", " Occupancy \n", "date \n", "2015-02-06 00:00:00 0 \n", "2015-02-06 00:01:00 0 \n", "2015-02-06 00:02:00 0 \n", "2015-02-06 00:03:00 0 \n", "2015-02-06 00:04:00 0 \n", "... ... \n", "2015-02-06 23:55:00 0 \n", "2015-02-06 23:55:59 0 \n", "2015-02-06 23:57:00 0 \n", "2015-02-06 23:57:59 0 \n", "2015-02-06 23:58:59 0 \n", "\n", "[1440 rows x 6 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataTraining['2015-02-06'] # Отобразим данные за 6 февраля 2015 года" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 142 }, "id": "350rNvXEjAkS", "outputId": "378a259e-82e2-4826-eac0-8db00d00c9d8" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TemperatureHumidityLightCO2HumidityRatioOccupancy
date
2015-02-08 14:55:0020.29026.285.666667418.6666670.0038530
2015-02-08 14:55:5920.31526.259.250000422.5000000.0038590
\n", "
" ], "text/plain": [ " Temperature Humidity Light CO2 \\\n", "date \n", "2015-02-08 14:55:00 20.290 26.2 85.666667 418.666667 \n", "2015-02-08 14:55:59 20.315 26.2 59.250000 422.500000 \n", "\n", " HumidityRatio Occupancy \n", "date \n", "2015-02-08 14:55:00 0.003853 0 \n", "2015-02-08 14:55:59 0.003859 0 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataTraining['2015-02-08 14:55':'2015-02-08 14:56'] # Отобразим данные, начиная с 14:55 и до 14:56 8 февраля 2015 года" ] }, { "cell_type": "markdown", "metadata": { "id": "C8xqSqQujAmh" }, "source": [ "## 6. Встроенные статистические функции\n", "\n", "Кроме уже известных и понятных нам суммы и среднего, есть и другие функции. Остановимся на них подробнее." ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "id": "hR8AlUzejAmh" }, "outputs": [], "source": [ "### 1. Корреляция\n" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "import matplotlib as plt\n", "import seaborn" ] }, { "cell_type": "markdown", "metadata": { "id": "M7H3eXu0jAmi" }, "source": [ "Обычный коэффициент корреляции Пирсона" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "IJLYQkG0jAmj", "outputId": "bb23dcea-6f7d-47d8-9c5f-90e39eb25cfa" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TemperatureHumidityLightCO2HumidityRatioOccupancy
Temperature1.000000-0.1417590.6499420.5598940.1517620.538220
Humidity-0.1417591.0000000.0378280.4390230.9551980.132964
Light0.6499420.0378281.0000000.6640220.2304200.907352
CO20.5598940.4390230.6640221.0000000.6265560.712235
HumidityRatio0.1517620.9551980.2304200.6265561.0000000.300282
Occupancy0.5382200.1329640.9073520.7122350.3002821.000000
\n", "
" ], "text/plain": [ " Temperature Humidity Light CO2 HumidityRatio \\\n", "Temperature 1.000000 -0.141759 0.649942 0.559894 0.151762 \n", "Humidity -0.141759 1.000000 0.037828 0.439023 0.955198 \n", "Light 0.649942 0.037828 1.000000 0.664022 0.230420 \n", "CO2 0.559894 0.439023 0.664022 1.000000 0.626556 \n", "HumidityRatio 0.151762 0.955198 0.230420 0.626556 1.000000 \n", "Occupancy 0.538220 0.132964 0.907352 0.712235 0.300282 \n", "\n", " Occupancy \n", "Temperature 0.538220 \n", "Humidity 0.132964 \n", "Light 0.907352 \n", "CO2 0.712235 \n", "HumidityRatio 0.300282 \n", "Occupancy 1.000000 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataTraining.corr() # Выведем матрицу корреляции для датафрейма dataTraining" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "seaborn.heatmap(dataTraining.corr())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qG6LB7yDjAmx" }, "outputs": [], "source": [ "### 2. Медиана" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "id": "YMbWszYgjAmx", "outputId": "d66c8b8e-f38a-43f3-cfbd-93178a035139" }, "outputs": [], "source": [ "dataTraining.median() # Выведем медиану для датафрейма dataTraining" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yRYKb92DjAm5" }, "outputs": [], "source": [ "### 4. Уникальные значения и их количество" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "id": "BlNFl4MxjAm9", "outputId": "68433809-3603-42f7-cdec-aa4594cfe6f2" }, "outputs": [], "source": [ "dataTraining.nunique() # Выведем количество уникльных значений для всех столбцов датафрейма dataTraining" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "4EKcGjQKjAnA", "outputId": "08915e97-77d3-4b26-e040-aa301943ec15" }, "outputs": [], "source": [ "dataTraining.Occupancy.unique() # Выведем уникальные значения для столбца Occupancy датафрейма dataTraining" ] }, { "cell_type": "markdown", "metadata": { "id": "8F6G2AUnKFCA" }, "source": [ "# Глоссарий\n", "\n", "\n", "pd.DataFrame(данные, columns = [колонки, если есть], index = [индексы ,если есть]) - создать датафрейм\n", "\n", "pd.read_csv(полный адрес расположения файла) - открыть .csv файл\n", "\n", "------------\n", "\n", ".head() - посмотреть верхушку датафрейма (первые n строк)\n", "\n", ".tail() - посмотреть конец датафрейма (последние n строк)\n", "\n", ".columns - список колонок 
датафрейма\n", "\n", ".values - вывести массив всех значений датафрейма\n", "\n", ".index - список индексов датафрейма\n", "\n", ".tolist() - перевести в список\n", "\n", ".count() - посчитать количество непустых (не NaN) значений в каждой колонке\n", "\n", ".describe() - посмотреть основные статистические характеристики фрейма\n", "\n", ".shape - форма фрейма (строки, колонки)\n", "\n", ".size - размер фрейма строки*колонки\n", "\n", ".info() - информация о данных каждой колонки\n", "\n", ".dtypes - тип данных каждой колонки\n", "\n", ".isnull() - где недостает значений\n", "\n", ".isna() - то же, что .isnull(): где пропущены значения (None/NaN)\n", "\n", ".dropna() - выкинуть строки/колонки с пропусками (None/NaN)\n", "\n", ".fillna() - заполнить заданным значением ячейки, где есть пропуски (None/NaN)\n", "\n", ".loc[] - выбрать значения по меткам (названиям) строк и колонок\n", "\n", ".iloc[] - выбрать значения по целочисленным позициям строк и колонок\n", "\n", ".drop() - выкинуть определенные строки/колонки\n", "\n", "--------------\n", "\n", "pd.to_datetime(колонка, которую переводим в формат временного ряда)\n", "\n", ".groupby() - сгруппировать по конкретному признаку\n", "\n", ".copy() - создать копию\n", "\n", ".sort_values() - сортировка значений\n", "\n", "pd.concat([df1,df2]) - конкатенация фреймов\n", "\n", ".merge(второй_датафрейм, on = 'общая колонка, по которой склеиваем', how = 'с какой стороны') - объединение фреймов по общему признаку (аналог SQL JOIN)\n", "\n", "-------------\n", "\n", "\n", ".corr() - вычислить корреляцию\n", "\n", ".median() - вычислить медиану\n", "\n", ".cumsum() - вычислить кумулятивную сумму\n", "\n", ".cumprod() - вычислить кумулятивное произведение\n", "\n", ".cummax() - вычислить кумулятивный максимум\n", "\n", "-------------\n", "\n", ".quantile([]) - вычислить квантили\n", "\n", ".nunique() - количество уникальных значений по колонкам/строкам\n", "\n", ".unique() - уникальные значения определенной колонки (Series)\n", "\n", "------------\n", "\n", ".apply(функция) - применить функцию для колонки/строки\n", "\n", ".agg(набор_функций) - применить ряд функций для 
колонки/строки\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Ku19ab06hkRZ" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "collapsed_sections": [ "_mybfuLlhkzU", "0Chntu25WIF2", "Af1_T2N8jAfF", "X6KDR8HPjAf3", "C8xqSqQujAmh", "hR8AlUzejAmh", "qG6LB7yDjAmx", "xzJAnTajjAm3", "yRYKb92DjAm5", "Idy8pVuvjAnD", "G_jD-MltjAnR", "9TdHEC0ajAng", "3ANmDztPdeNy" ], "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 1 }