"""Example: descriptive statistics and a least-squares regression on survey data.

Sources:
    https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
    https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/

Reading the Excel workbook requires openpyxl (pip3 install openpyxl).
"""
import math as m
import os

import numpy as np
import pandas as pd  # To read data
import scipy as sp
from scipy import stats

# Directory of this script, so the workbook next to it is found
# regardless of the current working directory.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

DATA_FILE = 'Daten_Umfrage_SPSS_20211113.xlsx'

# Row window used for every column. The sample output recorded with the
# original script shows row 0 and row 156 as NaN after numeric coercion,
# so only rows 1..155 are analysed.
DATA_ROWS = slice(1, 156)


def mean_norm_inverted(values, scale_max=6):
    """Return ``(mean, normalised, inverted)`` for one survey column.

    Args:
        values: pandas Series; NaN entries are ignored by the mean.
        scale_max: value the mean is divided by to normalise it
            (the original script hard-coded 6).

    Returns:
        The mean rounded to two decimals, ``abs(mean / scale_max)`` and
        ``1 - normalised``.
    """
    # NOTE(review): the mean is rounded *before* it is normalised, exactly as
    # in the original script, so the derived values inherit that rounding.
    # Confirm whether this is intended.
    mean = round(values.mean(skipna=True), 2)
    normalised = m.fabs(mean / scale_max)  # was sqrt(x**2), i.e. abs(x)
    return mean, normalised, 1 - normalised


def sorted_xy(x_values, y_values):
    """Sort two equally long value sequences in sync and return NumPy arrays.

    Pairs are ordered by x first and by y among equal x, which is the order
    the original ``sorted(zip(x, y))`` produced. Empty input yields two empty
    arrays instead of raising.
    """
    x_arr = np.asarray(x_values)
    y_arr = np.asarray(y_values)
    order = np.lexsort((y_arr, x_arr))  # last key (x) is the primary key
    return x_arr[order], y_arr[order]


def fit_line(x_arr, y_arr):
    """Fit ``y = slope * x + intercept`` by least squares (SciPy linregress).

    Returns:
        ``(slope, intercept, predicted)`` where ``predicted`` holds the
        regression line's y value for every entry of ``x_arr``.
    """
    result = sp.stats.linregress(x_arr, y_arr)
    predicted = result.slope * x_arr + result.intercept
    return result.slope, result.intercept, predicted


def main():
    """Load the workbook, print the statistics and show the regression plot."""
    # Imported here so the helpers above can be used without matplotlib.
    import matplotlib.pyplot as plt  # To visualize

    df = pd.read_excel(os.path.join(__location__, DATA_FILE))

    # Convert non-numeric cells to NaN, e.g. "CodeXYZ" -> NaN.
    df = df.apply(pd.to_numeric, errors='coerce')
    print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
    print(df.head(10))
    # Deliberately no df.dropna(): it would drop every row that contains
    # even a single NaN.
    print(df.tail(10))

    # Mean, normalised and inverted value of one example column.
    mwHO01_Diff, normHO01_Diff, invHO01_Diff = mean_norm_inverted(
        df["HO01_Diff"][DATA_ROWS])
    print("HO01_Diff Mittelwert:", mwHO01_Diff)
    print("HO01_Diff Normiert:", normHO01_Diff)
    print("HO01_Diff Invertiert:", invHO01_Diff)

    # X and Y columns of the regression, limited to the data rows.
    arrX = np.asarray(df["SS_Score"][DATA_ROWS])
    arrY = np.asarray(df["HO_Score_Bewerbung_Gewichtet"][DATA_ROWS])

    # Prepare plot image.
    plt.xlabel('SS_Score', color='black')
    plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
    plt.xlim([0, 50])  # x-axis view range [from, to]
    plt.scatter(arrX, arrY)

    arrX, arrY = sorted_xy(arrX, arrY)
    steigung, yAchsAbschn, arrYpredicted = fit_line(arrX, arrY)
    print("y =", steigung, "* x +", yAchsAbschn)

    # Plot the regression line.
    # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
    plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid')
    plt.show()


if __name__ == "__main__":
    main()
+17,1985,0,1,1,26,2083 +18,1042,0,0,1,48,2187 +19,1231,1,1,1,37,3902 +20,1488,0,0,0,37,2524 +21,968,0,0,1,7,1357 +22,529,0,1,0,60,3456 +23,1558,0,1,0,50,1641 +24,533,1,1,0,58,2322 +25,1037,0,1,0,5,3862 +26,1025,0,1,1,43,557 +27,1858,0,1,0,17,2427 +28,980,0,0,0,8,625 +29,644,1,1,0,15,3836 +30,1024,1,1,1,38,854 +31,1981,0,1,1,11,2383 +32,1380,0,1,1,56,926 +33,1557,1,1,0,32,3048 +34,1201,1,1,0,49,824 +35,1074,1,0,0,45,874 +36,1175,1,0,0,33,1617 +37,1280,0,1,1,8,3309 +38,1715,0,1,1,28,1786 +39,1165,0,0,0,40,3128 +40,567,0,0,0,2,2243 +41,1952,1,0,0,47,994 +42,822,1,1,0,43,2749 +43,685,1,1,0,12,793 +44,1388,1,1,1,17,324 +45,1972,0,0,0,48,366 +46,1411,1,0,1,57,1228 +47,1094,0,1,0,20,1413 +48,1653,0,1,1,13,2617 +49,916,0,1,0,33,1798 +50,1712,0,0,0,29,2693 +51,882,1,1,0,32,2437 +52,632,0,0,1,33,837 +53,1442,1,1,0,56,2815 +54,900,0,0,0,47,1440 +55,1630,1,1,0,20,3470 +56,1596,1,1,0,24,1251 +57,1272,1,0,1,54,3181 +58,1640,0,1,0,7,1361 +59,1111,0,1,0,15,1630 +60,1889,1,0,0,25,3689 +61,1907,1,1,0,22,3586 +62,529,0,1,0,56,3942 +63,578,0,0,1,38,1431 +64,1634,1,1,1,39,2167 +65,1533,1,1,1,22,1248 +66,660,0,0,1,47,1986 +67,1847,1,0,1,28,1378 +68,1206,1,1,0,10,2959 +69,549,1,0,1,53,1893 +70,1705,1,0,1,23,1676 +71,1366,0,1,0,36,1912 +72,1991,0,0,0,58,1947 +73,1102,1,0,1,40,2734 +74,1452,0,0,1,32,3617 +75,1810,1,1,0,63,3414 +76,1166,1,1,1,41,482 +77,881,1,1,0,6,2813 +78,1134,1,1,1,64,661 +79,1031,1,1,1,20,2546 +80,1376,1,1,1,34,3922 +81,1391,0,1,1,56,1199 +82,979,0,0,0,24,3779 +83,1075,1,0,0,36,3870 +84,968,0,1,1,61,858 +85,1999,0,1,1,15,3840 +86,1626,0,0,0,20,454 +87,942,0,1,0,28,3953 +88,1182,0,0,1,42,1633 +89,1982,1,1,0,48,3035 +90,1373,0,1,0,27,966 +91,1151,0,1,0,44,1761 +92,1650,1,1,0,51,1938 +93,1663,1,1,0,17,2820 +94,1965,1,0,1,3,305 +95,679,0,1,0,41,2838 +96,1465,0,0,0,3,2042 +97,1809,1,1,0,27,700 +98,757,1,1,1,26,2593 +99,1034,1,0,1,47,1835 +100,1119,1,0,0,23,3121 +101,559,1,1,1,24,2023 +102,1204,1,0,0,19,737 +103,1008,0,1,1,15,450 +104,1397,1,1,0,19,2928 +105,697,0,1,1,34,1694 
+106,1939,1,0,0,58,2593 +107,1039,0,1,0,15,745 +108,1605,1,0,1,51,1310 +109,1094,0,1,0,34,2743 +110,769,1,1,1,30,3976 +111,861,1,1,0,40,3931 +112,504,0,1,0,63,3455 +113,1930,1,1,1,64,1533 +114,1795,1,1,0,52,3876 +115,1363,0,1,1,2,3239 +116,1376,1,0,1,38,3628 +117,1981,1,0,0,19,3663 +118,1901,0,0,0,62,1786 +119,1319,0,1,0,5,3502 +120,859,1,0,0,58,815 +121,1664,1,0,0,25,275 +122,955,0,1,0,62,3758 +123,517,0,1,0,38,2747 +124,1806,0,0,1,64,1809 +125,1348,0,0,1,50,2086 +126,1455,0,0,0,16,444 +127,1611,0,1,NaN,25,3467 +128,1573,1,1,1,24,2776 +129,557,0,0,1,23,3170 +130,1599,1,0,0,31,1234 +131,1051,1,1,0,16,305 +132,1857,1,1,0,14,1745 +133,1986,0,0,0,24,1707 +134,591,1,0,0,21,2220 +135,1140,0,1,1,56,3130 +136,923,1,0,1,10,1500 +137,1582,NaN,0,0,33,2145 +138,723,0,0,0,63,488 +139,1251,1,1,1,54,3863 +140,574,1,1,1,60,2913 +141,948,1,1,1,60,2094 +142,1571,0,1,0,44,3141 +143,564,1,0,0,33,2573 +144,1466,0,1,1,57, +145,597,1,1,0,16,3788 +146,895,1,1,1,9,3445 +147,1535,0,1,1,37,3241 +148,1832,1,0,0,44,2976 +149,1045,1,0,1,58,2241 +150,1483,1,0,1,61,3843 +151,976,0,0,1, ,3261 +152,1840,1,1,0,46,3264 +153,1807,0,1,0,8,826 +154,624,1,1,0,30,1314 +155,1963,0,0,1,38,2699 +156,1307,1,0,1,4,2565 +157,1933,1,1, ,23,3973 +158,1496,1,1,0,42,3537 +159,1532,1,0,0,7,3449 +160,1004,0,1,1,8,3895 +161,1483,1,1,1,38,2777 +162,945,0,0,0,47,2904 +163,1081,NaN,0,1,17,3774 +164,1012,0,1,1,32,3034 +165,1762,0,0,0,50,2940 +166,796,1,1,1,36, +167,1547,0,0,1,50,1168 +168,988,0,0,1,12,376 +169,1180,1,0,1,43,3510 +170,852,1,0,1,3,542 +171,607,1,0,1,18,550 +172,1765,0,0,0,24,791 +173,1250,0,0,0,63,1895 +174,1577,0,1,0,55,609 +175,1153,1,1,0,14,263 +176,651,1,0,0, ,2141 +177,1186,1,1,0,25,1270 +178,1429,1,0,0,33,403 +179,556,0,0,1,10,2040 +180,1735,1,0,1,21,2597 +181,1859,0,0,1,42,714 +182,915,0,0,0,10,423 +183,890,NaN,0,0,9,1238 +184,758,0,0,0,48,926 +185,541,0,0,1,28,2704 +186,586,0,1,1,33,2817 +187,762,1,1,1,39,2271 +188,683,0,1,1,9,1513 +189,1526,0,1,0,39,339 +190,1771,1,0,0,39,794 +191,639,0,0,0,28,391 
"""Term paper analysis of ``mobile_device_data.csv``.

Steps: a) load the source data, b) clean it, c) univariate analysis of the
metric features, d) bar charts (not implemented yet), e) Pearson correlation
and a linear regression of two features.

Run as a script; importing the module only defines the helpers.
"""
import os

import numpy as np  # pip3 install numpy
import pandas as pd  # pip3 install pandas
import scipy as sp
from scipy import stats

# Directory of this script, so the CSV files next to it are found
# regardless of the current working directory.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

# All columns except "id".
gewuenschteSpalten = ["battery_power", "bluetooth", "dual_sim", "4G", "int_memory", "ram"]
METRISCHE_MERKMALE = ["battery_power", "int_memory", "ram"]
NOMINALE_MERKMALE = ["bluetooth", "dual_sim", "4G"]


def load_data(path=None):
    """Read the source CSV, skipping the ``id`` column.

    Args:
        path: CSV file to read; defaults to ``mobile_device_data.csv`` next
            to this script.

    Returns:
        DataFrame with the columns listed in ``gewuenschteSpalten``.
    """
    if path is None:
        path = os.path.join(__location__, 'mobile_device_data.csv')
    return pd.read_csv(path, usecols=gewuenschteSpalten)


def clean_data(df):
    """Return a copy of ``df`` without rows that contain an unusable cell.

    Two kinds of cells cannot be evaluated: cells that are already NaN, and
    cells holding only whitespace (the source file contains ``' '``). The
    latter are converted to NaN first so that both kinds can be dropped the
    same way. Any blank-only string is matched, not just a single space,
    because such a cell would otherwise survive and break the later integer
    conversion. The input frame is not modified.
    """
    blanks_as_nan = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
    # axis=0: drop rows; how='any': a single empty cell is enough.
    return blanks_as_nan.dropna(axis=0, how='any')


def metric_features(df):
    """Return the metric columns of ``df`` converted to integers.

    The columns are not all parsed as integers when the CSV is read (blank
    cells turn a column into strings), and ``describe()`` cannot summarise
    such mixed columns properly, hence the explicit conversion.
    """
    return df[METRISCHE_MERKMALE].astype(int)


def univariate_analysis(metric_df):
    """Return count, mean, std, min, 20%/50%/80% percentiles and max per column."""
    return metric_df.describe(include='all', percentiles=[0.2, 0.5, 0.8])


def fit_regression(x_values, y_values):
    """Fit ``y = slope * x + intercept`` by least squares (SciPy linregress).

    Returns:
        ``(slope, intercept, predicted)``. The coefficients are *not*
        rounded, so ``predicted`` (the line's y value for every x) is exact;
        round only when displaying the equation.
    """
    result = sp.stats.linregress(x_values, y_values)
    predicted = result.slope * x_values + result.intercept
    return result.slope, result.intercept, predicted


def plot_regression(x_values, y_values, predicted):
    """Show a scatter plot of ram vs. battery_power with the regression line."""
    # Imported here so the analysis helpers can be used without matplotlib.
    import matplotlib.pyplot as plt

    # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
    plt.plot(x_values, predicted, label='Lin Regression', color='red', linestyle='solid')
    plt.xlabel('ram', color='black')
    plt.ylabel('battery_power', color='black')
    plt.scatter(x_values, y_values)
    plt.show()


def main():
    """Run steps a) to e) and print/plot/store their results."""
    ### a) Load the source data
    df = load_data()
    print(df.head(n=12))  # first 12 rows (row 0 to 11)

    ### b) Clean
    df = clean_data(df)
    # The original run reported 200 remaining rows for the shipped data set.
    print(len(df.index))

    ### c) Univariate analysis
    dfMetrischeMerkmale = metric_features(df)
    dfUnivariateAnalyse = univariate_analysis(dfMetrischeMerkmale)
    dfUnivariateAnalyse.to_csv(os.path.join(__location__, 'UnivariateAnalyse.csv'))

    ### d) Bar charts
    # TODO: bar charts for the nominal features (NOMINALE_MERKMALE) are not
    # implemented yet.

    ### e) Pearson correlations and linear regression of two features
    print(dfMetrischeMerkmale.corr(method="pearson"))
    # In the recorded output ram and battery_power have the largest absolute
    # correlation (-0.069141), so these two are used for the regression.
    werteListeX = dfMetrischeMerkmale["ram"]
    werteListeY = dfMetrischeMerkmale["battery_power"]
    steigung, yAchsAbschn, arrYpredicted = fit_regression(werteListeX, werteListeY)
    print("Regressionsgleichung:", "y =", round(steigung, 4), "* x +", round(yAchsAbschn, 4))

    plot_regression(werteListeX, werteListeY, arrYpredicted)


if __name__ == "__main__":
    main()