Code for the term paper, parts a-c and e

dev weycloud
2022-01-22 17:08:15 +01:00
parent b3e4a699e4
commit 14f7b483f9
7 changed files with 416 additions and 2 deletions

View File

@@ -0,0 +1,84 @@
```
# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
# based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
# pip3 install openpyxl
import os
import pandas as pd # To read data
import math as m
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt # To visualize
# location will help to open files in the same directory as the py-script
__location__ = os.path.realpath(
os.path.join(os.getcwd(), os.path.dirname(__file__)))
df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
df = df.apply(pd.to_numeric, errors='coerce') # convert non-numeric values to NaN, e.g. the text cell "CodeXYZ" in row 1 becomes NaN
print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
print(df.head(10))
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
# 0 NaN NaN NaN NaN NaN NaN
# 1 NaN 6.0 4.0 7.0 4.0 5.0
# ...
#df = df.dropna() # CAUTION: drops every row that contains even a single NaN!
print(df.tail(10))
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
# 155 NaN 4.0 4.0 3.0 5.0 1.0
# 156 NaN NaN NaN NaN NaN NaN
# (End of File)
#print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
#for col in df.columns:
#print(col)
# Calculate mean, normalized and inverted value
mwHO01_Diff = df["HO01_Diff"][1:156] # limit to the relevant rows of this column
mwHO01_Diff = mwHO01_Diff.mean(skipna=True) # arithmetic mean of the column, skipna ignores NaN cells
mwHO01_Diff = round(mwHO01_Diff, 2)
normHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2) # normalize: sqrt of the square, i.e. the absolute value of mwHO01_Diff / 6
invHO01_Diff = 1 - normHO01_Diff # invert
# etc. for the remaining columns
print("HO01_Diff Mittelwert:", mwHO01_Diff)
print("HO01_Diff Normiert:", normHO01_Diff)
print("HO01_Diff Invertiert:", invHO01_Diff)
# etc. for the remaining columns
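# Sketch (not part of the original script): the mean / normalize / invert step above
# could be repeated for further columns with a small loop; the column list here is
# only an illustrative placeholder.
#for spalte in ["HO01_Diff"]:
#    mw = round(df[spalte][1:156].mean(skipna=True), 2)
#    norm = m.sqrt((mw / 6)**2)
#    print(spalte, "Mittelwert:", mw, "Normiert:", norm, "Invertiert:", 1 - norm)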
# Select the DataFrame columns and the row range
dfColumnX = df["SS_Score"][1:156]
dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
# Convert Dataframe Columns to Array containing the X- and Y- Values
arrX = np.asarray(dfColumnX) # convert that dataframe column to numpy array
arrY = np.asarray(dfColumnY)
# Prepare Plot Image
plt.xlabel('SS_Score', color='black')
plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
plt.xlim([0,50]) # set x-Axis View Range,[from,to]
plt.scatter(arrX, arrY)
arrX, arrY = zip(*sorted(zip(arrX,arrY))) # sort 2 arrays in sync
# Convert back to numpy arrays, since sorting via zip() yields tuples
arrX = np.asarray(arrX) # before: "1 16.0" after: "[16. 18. 21. ...]"
arrY = np.asarray(arrY)
# Use least Square Linear Regression from SciPy Stats
regr_results = sp.stats.linregress(arrX, arrY)
steigung = regr_results.slope
yAchsAbschn = regr_results.intercept
arrYpredicted = steigung * arrX + yAchsAbschn # using y = m*x + n, compute the predicted y value for every x value
print("y =", steigung, "* x +", yAchsAbschn)
# Plot Linear Regression Line
plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
plt.show()
```

View File

@@ -0,0 +1,9 @@
,battery_power,int_memory,ram
count,200.0,200.0,200.0
mean,1264.56,33.485,2153.125
std,441.5502229610862,17.79559470640905,1140.4263724281582
min,504.0,2.0,263.0
20%,857.6,16.0,870.8000000000001
50%,1249.5,33.0,2172.5
80%,1721.4000000000003,51.0,3317.600000000001
max,1999.0,64.0,3976.0

View File

@@ -0,0 +1,211 @@
id,battery_power,bluetooth,dual_sim,4G,int_memory,ram
1,1043,1,1,0,5,3476
2,841,1,1,1,61,3895
3,1807,1,0,0,27,2396
4,1546,0,1,1,25,3893
5,1434,0,0,1,49,1773
6,1464,1,1,1,50,3506
7,1718,0,0,0,47,3873
8,833,0,1,0,62,1495
9,1111,1,1,1,25,3485
10,1520,0,0,0,25,651
11,1500,0,0,0,55,3866
12,1343,0,0,1,34,3911
13,900,1,1,0,30,439
14,1190,1,1,0,19,992
15,630,0,0,1,51,2751
16,1846,1,0,1,53,563
17,1985,0,1,1,26,2083
18,1042,0,0,1,48,2187
19,1231,1,1,1,37,3902
20,1488,0,0,0,37,2524
21,968,0,0,1,7,1357
22,529,0,1,0,60,3456
23,1558,0,1,0,50,1641
24,533,1,1,0,58,2322
25,1037,0,1,0,5,3862
26,1025,0,1,1,43,557
27,1858,0,1,0,17,2427
28,980,0,0,0,8,625
29,644,1,1,0,15,3836
30,1024,1,1,1,38,854
31,1981,0,1,1,11,2383
32,1380,0,1,1,56,926
33,1557,1,1,0,32,3048
34,1201,1,1,0,49,824
35,1074,1,0,0,45,874
36,1175,1,0,0,33,1617
37,1280,0,1,1,8,3309
38,1715,0,1,1,28,1786
39,1165,0,0,0,40,3128
40,567,0,0,0,2,2243
41,1952,1,0,0,47,994
42,822,1,1,0,43,2749
43,685,1,1,0,12,793
44,1388,1,1,1,17,324
45,1972,0,0,0,48,366
46,1411,1,0,1,57,1228
47,1094,0,1,0,20,1413
48,1653,0,1,1,13,2617
49,916,0,1,0,33,1798
50,1712,0,0,0,29,2693
51,882,1,1,0,32,2437
52,632,0,0,1,33,837
53,1442,1,1,0,56,2815
54,900,0,0,0,47,1440
55,1630,1,1,0,20,3470
56,1596,1,1,0,24,1251
57,1272,1,0,1,54,3181
58,1640,0,1,0,7,1361
59,1111,0,1,0,15,1630
60,1889,1,0,0,25,3689
61,1907,1,1,0,22,3586
62,529,0,1,0,56,3942
63,578,0,0,1,38,1431
64,1634,1,1,1,39,2167
65,1533,1,1,1,22,1248
66,660,0,0,1,47,1986
67,1847,1,0,1,28,1378
68,1206,1,1,0,10,2959
69,549,1,0,1,53,1893
70,1705,1,0,1,23,1676
71,1366,0,1,0,36,1912
72,1991,0,0,0,58,1947
73,1102,1,0,1,40,2734
74,1452,0,0,1,32,3617
75,1810,1,1,0,63,3414
76,1166,1,1,1,41,482
77,881,1,1,0,6,2813
78,1134,1,1,1,64,661
79,1031,1,1,1,20,2546
80,1376,1,1,1,34,3922
81,1391,0,1,1,56,1199
82,979,0,0,0,24,3779
83,1075,1,0,0,36,3870
84,968,0,1,1,61,858
85,1999,0,1,1,15,3840
86,1626,0,0,0,20,454
87,942,0,1,0,28,3953
88,1182,0,0,1,42,1633
89,1982,1,1,0,48,3035
90,1373,0,1,0,27,966
91,1151,0,1,0,44,1761
92,1650,1,1,0,51,1938
93,1663,1,1,0,17,2820
94,1965,1,0,1,3,305
95,679,0,1,0,41,2838
96,1465,0,0,0,3,2042
97,1809,1,1,0,27,700
98,757,1,1,1,26,2593
99,1034,1,0,1,47,1835
100,1119,1,0,0,23,3121
101,559,1,1,1,24,2023
102,1204,1,0,0,19,737
103,1008,0,1,1,15,450
104,1397,1,1,0,19,2928
105,697,0,1,1,34,1694
106,1939,1,0,0,58,2593
107,1039,0,1,0,15,745
108,1605,1,0,1,51,1310
109,1094,0,1,0,34,2743
110,769,1,1,1,30,3976
111,861,1,1,0,40,3931
112,504,0,1,0,63,3455
113,1930,1,1,1,64,1533
114,1795,1,1,0,52,3876
115,1363,0,1,1,2,3239
116,1376,1,0,1,38,3628
117,1981,1,0,0,19,3663
118,1901,0,0,0,62,1786
119,1319,0,1,0,5,3502
120,859,1,0,0,58,815
121,1664,1,0,0,25,275
122,955,0,1,0,62,3758
123,517,0,1,0,38,2747
124,1806,0,0,1,64,1809
125,1348,0,0,1,50,2086
126,1455,0,0,0,16,444
127,1611,0,1,NaN,25,3467
128,1573,1,1,1,24,2776
129,557,0,0,1,23,3170
130,1599,1,0,0,31,1234
131,1051,1,1,0,16,305
132,1857,1,1,0,14,1745
133,1986,0,0,0,24,1707
134,591,1,0,0,21,2220
135,1140,0,1,1,56,3130
136,923,1,0,1,10,1500
137,1582,NaN,0,0,33,2145
138,723,0,0,0,63,488
139,1251,1,1,1,54,3863
140,574,1,1,1,60,2913
141,948,1,1,1,60,2094
142,1571,0,1,0,44,3141
143,564,1,0,0,33,2573
144,1466,0,1,1,57,
145,597,1,1,0,16,3788
146,895,1,1,1,9,3445
147,1535,0,1,1,37,3241
148,1832,1,0,0,44,2976
149,1045,1,0,1,58,2241
150,1483,1,0,1,61,3843
151,976,0,0,1, ,3261
152,1840,1,1,0,46,3264
153,1807,0,1,0,8,826
154,624,1,1,0,30,1314
155,1963,0,0,1,38,2699
156,1307,1,0,1,4,2565
157,1933,1,1, ,23,3973
158,1496,1,1,0,42,3537
159,1532,1,0,0,7,3449
160,1004,0,1,1,8,3895
161,1483,1,1,1,38,2777
162,945,0,0,0,47,2904
163,1081,NaN,0,1,17,3774
164,1012,0,1,1,32,3034
165,1762,0,0,0,50,2940
166,796,1,1,1,36,
167,1547,0,0,1,50,1168
168,988,0,0,1,12,376
169,1180,1,0,1,43,3510
170,852,1,0,1,3,542
171,607,1,0,1,18,550
172,1765,0,0,0,24,791
173,1250,0,0,0,63,1895
174,1577,0,1,0,55,609
175,1153,1,1,0,14,263
176,651,1,0,0, ,2141
177,1186,1,1,0,25,1270
178,1429,1,0,0,33,403
179,556,0,0,1,10,2040
180,1735,1,0,1,21,2597
181,1859,0,0,1,42,714
182,915,0,0,0,10,423
183,890,NaN,0,0,9,1238
184,758,0,0,0,48,926
185,541,0,0,1,28,2704
186,586,0,1,1,33,2817
187,762,1,1,1,39,2271
188,683,0,1,1,9,1513
189,1526,0,1,0,39,339
190,1771,1,0,0,39,794
191,639,0,0,0,28,391
192,1783,1,0,1,23,541
193,1933,0,0,1,55,648
194,1384,,1,1,62,3386
195,1770,0,0,0,26,1651
196,1202,0,0,0,49,1177
197,885,0,1,0,3,2887
198,1629,1,0,1,2,2178
199,1072,1,1,1,4,2878
200,1863,1,1,0,64,3201
201,1739,0,1,0,51,1490
202,895,1,0,0,23,2724
203,1278,1,1,0,56,3032
204,562,1,1,1,43,3352
205,1249,1,1,0,38,3195
206,1811,0,0,1,25,1677
207,560,1,1,1,12,2620
208,1773,1,0,1,61,1061
209,1715,1,0,0,11,1018
210,725,0,1,1,26,1370

View File

@@ -0,0 +1,107 @@
```
import numpy as np # pip3 install numpy
import pandas as pd # pip3 install pandas
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
### a) Reading the source data
# Use the pandas method read_csv(), leaving out the column named "id"
gewuenschteSpalten = ["battery_power", "bluetooth", "dual_sim", "4G", "int_memory", "ram"]
df = pd.read_csv('mobile_device_data.csv', usecols=gewuenschteSpalten)
# Call print() together with the DataFrame method head()
# Parameter n: output the first 12 rows (rows 0 to 11 = 12 rows)
print(df.head(n=12))
# Output:
# battery_power bluetooth dual_sim 4G int_memory ram
# 0 1043 1.0 1 0 5 3476
# ... ... ... ... ... ... ...
# 11 1343 0.0 0 1 34 3911
### b) Cleaning
# The DataFrame contains several kinds of cells that cannot be evaluated:
# 1. cells that already contain NaN (Not a Number)
# 2. cells that contain a blank string (' ')
# -> convert (' ') to NaN via the DataFrame method replace()
df.replace(to_replace=' ', value=np.nan, inplace=True)
# Cases 1 and 2 can now be counted in the same way; 4 rows are found
# in which the bluetooth field is NaN, plus a total of 6 rows in which
# the 4G, int_memory or ram field is NaN:
# print(df.isna().sum())
# Output:
# battery_power 0
# bluetooth 4
# dual_sim 0
# 4G 2
# int_memory 2
# ram 2
# Call the DataFrame method dropna() to delete such entries:
# Parameter axis=0 : the row is dropped
# Parameter how='any' : a single empty cell is enough to drop the row
# Parameter inplace=True : the operation is applied directly to the DataFrame
df.dropna(axis=0, how='any', inplace=True)
# After this change only 200 rows remain in the DataFrame
print(len(df.index))
# Output:
# 200
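# Alternative (sketch, not used here): just like the survey script in this commit,
# all non-numeric cells could also be coerced to NaN in a single step:
#df = df.apply(pd.to_numeric, errors='coerce')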
### c) Univariate analysis
# The data types of the selected features are not all interpreted as integers
# from the start (cf. print(df.info()) ), so convert them first.
# Without this step, .describe() cannot handle the mixed column types properly;
# the sample output would then read: "mean 1264.560000 NaN NaN"
dfMetrischeMerkmale = df[["battery_power", "int_memory", "ram"]].astype(int)
dfUnivariateAnalyse = dfMetrischeMerkmale[["battery_power", "int_memory", "ram"]].describe(include='all', percentiles=[0.2, 0.5, 0.8])
dfUnivariateAnalyse.to_csv('UnivariateAnalyse.csv')
# Output:
# battery_power int_memory ram
# count 200.000000 200.000000 200.000000
# mean 1264.560000 33.485000 2153.125000
# std 441.550223 17.795595 1140.426372
# min 504.000000 2.000000 263.000000
# 20% 857.600000 16.000000 870.800000
# 50% 1249.500000 33.000000 2172.500000
# 80% 1721.400000 51.000000 3317.600000
# max 1999.000000 64.000000 3976.000000
### d) Bar charts
dfNominaleMerkmale = df[["bluetooth", "dual_sim", "4G"]].astype(int)
#dfNominaleSummen = dfNominaleMerkmale["bluetooth", "dual_sim", "4G"].sum()
#print(dfNominaleMerkmale)
#dfNominaleMerkmale.plot(kind="hist")
#plt.show()
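# Sketch for d) (not part of this commit, which covers tasks a-c and e):
# one possible way to draw a bar chart of the value counts for each nominal feature.
# Everything below is an assumption about how d) could look, not the submitted solution.
#fig, achsen = plt.subplots(1, 3)
#for achse, spalte in zip(achsen, ["bluetooth", "dual_sim", "4G"]):
#    dfNominaleMerkmale[spalte].value_counts().sort_index().plot(kind="bar", ax=achse, title=spalte)
#plt.show()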
### e) Pearson correlation and linear regression of two features
print(dfMetrischeMerkmale.corr(method="pearson"))
# Ausgabe:
# battery_power int_memory ram
# battery_power 1.000000 0.050449 -0.069141
# int_memory 0.050449 1.000000 0.047475
# ram -0.069141 0.047475 1.000000
# -> ram and battery_power show the strongest correlation, albeit a negative one:
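# (Sketch, not in the original script:) the strongest absolute off-diagonal correlation
# could also be located programmatically rather than by eye:
#korrAbs = dfMetrischeMerkmale.corr(method="pearson").abs()
#np.fill_diagonal(korrAbs.values, 0)
#print(korrAbs.stack().idxmax()) # -> e.g. ('battery_power', 'ram')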
# Use the SciPy stats module: least-squares method for the linear regression
werteListeX = dfMetrischeMerkmale["ram"]
werteListeY = dfMetrischeMerkmale["battery_power"]
regrErgebnisse = sp.stats.linregress(werteListeX, werteListeY)
steigung = round(regrErgebnisse.slope, 4)
yAchsAbschn = round(regrErgebnisse.intercept, 4)
arrYpredicted = steigung * werteListeX + yAchsAbschn # using y = m*x + n, compute the predicted y value for every x value
print("Regressionsgleichung:", "y =", steigung, "* x +", yAchsAbschn)
# Plot Linear Regression Line
plt.plot(werteListeX, arrYpredicted, label='Lin Regression', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
# Show Plot Image
plt.xlabel('ram', color='black')
plt.ylabel('battery_power', color='black')
#plt.xlim([0,50]) # set x-Axis View Range,[from,to]
plt.scatter(werteListeX, werteListeY)
plt.show()
```

View File

@@ -91,18 +91,21 @@ df["HO_Score_erklaert"] = yAchsAbschn + df["SS_Score"] * steigung
 # Vorlesung 6
 df["Diff_roh_erklaert"] = df["HO_Score_Bewerbung_Roh"] - df["HO_Score_erklaert"]
+print("Diff_roh_erklaert_Mean", df["Diff_roh_erklaert"].mean())
 df.loc[df.Diff_roh_erklaert < 0, "Diff_roh_erklaert"] = 0 # Differenz soll minimal 0 sein dürfen
 df["H0_Final"] = df["HO_Score_Bewerbung_Gewichtet"] + df["Diff_roh_erklaert"]
 df["H0_Wert_Z_Wert"] = (df["H0_Final"] - df["H0_Final"].mean()) / df["H0_Final"].std()
+print("HO_Final mean", df["H0_Final"].mean(), "HO_Final std:", df["H0_Final"].std())
+print("SS_Mean", df["SS_Score"].mean())
 df["H0_Wert_7er_Skala"] = (df["H0_Wert_Z_Wert"] * 1.5) + 4
 df["H0_Wert_7er_Skala"] = round(df["H0_Wert_7er_Skala"], 2)
 df.loc[df.H0_Wert_7er_Skala < 1, "H0_Wert_7er_Skala"] = 1 # ausreisser festlegen auf Min 1
 df.loc[df.H0_Wert_7er_Skala > 7, "H0_Wert_7er_Skala"] = 7 # ausreisser festlegen auf Max 7
-#df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")
+df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")
 # Open Dataframe in Webbrowser:
 def showDf(df):
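The hunk above shows the step of the survey script that z-standardizes the final H0 score and maps it onto a 7-point scale centred on 4 with a spread of 1.5, clamping outliers to the range 1-7. A minimal standalone sketch of that mapping (the z-values are illustrative and not taken from the survey data):

```
import numpy as np

z_werte = np.array([-2.5, -1.0, 0.0, 1.0, 2.5])  # illustrative z-scores
skala = np.clip(z_werte * 1.5 + 4, 1, 7)         # same transform and clamping as in the script
print(skala)  # -> [1.  2.5 4.  5.5 7. ]
```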