Lineare Regression

2021-11-14 17:24:14 +01:00
parent 402383f289
commit ea08ba9b18
7 changed files with 174 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -46,3 +46,7 @@
    - Listen und Arrays
        - ```Uebung1.py```
        - ToDo: ```Uebung2.py```
 # Vorlesung 5
 11.11.2021 
 - ```Vorlesung V.pdf```
--- a/Sonstiges/STAT2/vl2-varianz-v1.py
+++ b/Sonstiges/STAT2/vl2-varianz-v1.py
@@ -1,7 +1,12 @@
 import os
 import pandas as pd
 import numpy as np
-df = pd.read_csv('/home/pi/Documents/Code/Python/ProgrammierungUndDatenanalyse/Sonstiges/STAT2/vl2-varianz-v1.csv')
+# location will help to open files in the same directory as the py-script
 __location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
 df = pd.read_csv(os.path.join(__location__, 'vl2-varianz-v1.csv'))
 # Dataframe
 print(df)
--- a/Sonstiges/STAT2/vl3-standardfehler.py
+++ b/Sonstiges/STAT2/vl3-standardfehler.py
@@ -1,7 +1,12 @@
 import os
 import pandas as pd
 import numpy as np
-df = pd.read_csv('/home/pi/Documents/Code/Python/ProgrammierungUndDatenanalyse/Sonstiges/STAT2/vl3-standardfehler.csv')
+# location will help to open files in the same directory as the py-script
 __location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
 df = pd.read_csv(os.path.join(__location__, 'vl3-standardfehler.csv'))
 # Dataframe
 print(df)
@@ -27,14 +32,28 @@ for index, row in df.iterrows():
    summeQuadrierteAbweichungen = summeQuadrierteAbweichungen + (row.freq * (row.x - mean)**2)
    print(row['x'], row['freq'], 'summe²abweichungen: ', summeQuadrierteAbweichungen)
 variance = variancePart1 * summeQuadrierteAbweichungen
-print("variance: ", variance)
+print("pop variance: ", variance)
 # √(^σ²)
 standardDev = variance**(1/2) # √n = n^1/2
-print("Standardabweichung: ", standardDev) 
+print("pop Standardabweichung: ", standardDev) 
 # ŝd / √freq  (standard error of the mean: standard deviation divided by the square root of the total frequency)
 standardfehler = standardDev / sums.freq**(1/2) # √n = n^1/2
 print("Standardfehler des Mittelwerts: ", standardfehler) 
 # "Bonus":
 # Mittelwertsverteilung bei 2 Würfeln
 print()
 import random
--- a/Sonstiges/STAT2/vl4-zufriedenheit.csv
+++ b/Sonstiges/STAT2/vl4-zufriedenheit.csv
@@ -0,0 +1,6 @@
 x,freq
 1,159
 2,500
 3,674
 4,110
 5,21
--- a/Sonstiges/STAT2/vl4-zufriedenheit.py
+++ b/Sonstiges/STAT2/vl4-zufriedenheit.py
@@ -0,0 +1,57 @@
 import pandas as pd
 import numpy as np
 import os
 # Frequency-weighted descriptive statistics for the satisfaction survey:
 # mean, estimated population variance / standard deviation, standard error
 # of the mean, confidence intervals and z-scores of the extreme ratings.
 #
 # __location__ is the directory containing this script, so the CSV next to
 # it is found regardless of the current working directory (a hard-coded
 # absolute path only works on the machine it was written on).
 __location__ = os.path.realpath(
      os.path.join(os.getcwd(), os.path.dirname(__file__)))
 df = pd.read_csv(os.path.join(__location__, 'vl4-zufriedenheit.csv'))
 # Dataframe
 print(df)
 print(df.sum())
 sums = df.sum()
 print('Summierte Häufigkeit: ', sums['freq'])
 # Mean, respecting frequencies
 # mean = Σ(freq * x) / Σ(freq)
 rowSum = (df['x'] * df['freq']).sum()
 mean = rowSum / sums.freq
 print("mean: ", mean)
 # Estimated population variance, respecting frequencies
 # ^σ² = 1 / (n - 1) * Σ(freq * (x - mean)²), with n = Σ(freq)
 variancePart1 = (1 / (sums.freq - 1))
 summeQuadrierteAbweichungen = 0
 # Kept as an explicit loop because the running sum is printed per row
 for index, row in df.iterrows():
      summeQuadrierteAbweichungen = summeQuadrierteAbweichungen + (row.freq * (row.x - mean)**2)
      print(row['x'], row['freq'], 'summe²abweichungen: ', summeQuadrierteAbweichungen)
 variance = variancePart1 * summeQuadrierteAbweichungen
 print("pop variance: ", variance)
 # Estimated standard deviation: ^σ = √(^σ²) = (^σ²)^(1/2)
 standardDev = variance**(1/2)
 print("pop Standardabweichung: ", standardDev)
 # Standard error of the mean: √(^σ² / n), which equals ^σ / √n
 standardfehler = (variance / sums.freq)**(1/2)
 print("Standardfehler des Mittelwerts: ", standardfehler)
 # 95.5% confidence interval = mean -+ 2 * standard error
 konf955unten = mean - 2 * standardfehler
 konf955oben = mean + 2 * standardfehler
 print("95,5% Konfidenzintervall ", konf955unten, konf955oben)
 # 95% confidence interval = mean -+ 1.96 * standard error
 konf95unten = mean - 1.96 * standardfehler
 konf95oben = mean + 1.96 * standardfehler
 print("95% Konfidenzintervall ", konf95unten, konf95oben)
 # z-score = (xi - mean) / standardDev
 # z-score of rating 1.00 ("Ausgezeichnet")
 zwert1 = (1 - mean) / standardDev
 print("zwert1", zwert1)
 # z-score of rating 5.00 ("Schlecht")
 zwert5 = (5 - mean) / standardDev
 print("zwert5", zwert5)
--- a/Sonstiges/TENTW/Daten_Umfrage_SPSS_20211113.xlsx
+++ b/Sonstiges/TENTW/Daten_Umfrage_SPSS_20211113.xlsx
--- a/Sonstiges/TENTW/linearregression.py
+++ b/Sonstiges/TENTW/linearregression.py
@@ -0,0 +1,79 @@
 # https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
 #   or https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
 # pip3 install openpyxl
 from numpy.matrixlib import defmatrix
 import os
 import pandas as pd  # To read data
 import math as m
 import numpy as np
 import scipy as sp 
 from scipy import stats
 import matplotlib.pyplot as plt  # To visualize
 # Reads the survey workbook, prints mean / weighted / inverted values for
 # HO01_Diff, then fits and plots a least-squares regression line of
 # HO_Score_Bewerbung_Gewichtet over SS_Score.
 #
 # __location__ is the directory containing this script, so the workbook next
 # to it can be opened regardless of the current working directory
 __location__ = os.path.realpath(
      os.path.join(os.getcwd(), os.path.dirname(__file__)))
 df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
 df = df.apply(pd.to_numeric, errors='coerce')          # convert non-numeric values to NaN, e.g. Header "row 1" "CodeXYZ" -> "row 1" "NaN"
 print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
 print(df.head(10))
 #        Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01 
 # 0      NaN      NaN      NaN      NaN      NaN      NaN   
 # 1      NaN      6.0      4.0      7.0      4.0      5.0   
 # ...
 #df = df.dropna()                                   # CAUTION: drops every row that even contains single NaN !
 print(df.tail(10))
 #          Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01  
 # 155      NaN      4.0      4.0      3.0      5.0      1.0  
 # 156      NaN      NaN      NaN      NaN      NaN      NaN 
 # (End of File)
 #print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
 #for col in df.columns:
        #print(col)
 # Calculate mean, weighted ("gew") and inverted ("inv") value
 mwHO01_Diff = df["HO01_Diff"][1:156].mean(skipna=True)     # skipna to ignore NaN rows
 mwHO01_Diff = round(mwHO01_Diff, 2)
 gewHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2)                # sqrt(x**2) yields the magnitude of mean / 6
 invHO01_Diff = 1 - gewHO01_Diff
 # etc.
 print("HO01_Diff Mittelwert:", mwHO01_Diff)
 print("HO01_Diff Gewichtet:", gewHO01_Diff)
 print("HO01_Diff Invertiert:", invHO01_Diff)
 # etc.
 # Limit Dataframe Column and row Amount
 dfColumnX = df["SS_Score"][1:156]
 dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
 # Convert Dataframe Columns to Array containing the X- and Y- Values
 arrX = np.asarray(dfColumnX)                         # convert that dataframe column to numpy array
 arrY = np.asarray(dfColumnY)
 # Keep only complete (x, y) pairs: a single NaN would make the tuple sort
 # order undefined and turn slope and intercept of linregress into NaN
 vollstaendig = ~(pd.isna(arrX) | pd.isna(arrY))
 arrX = arrX[vollstaendig]
 arrY = arrY[vollstaendig]
 # Prepare Plot Image
 plt.xlabel('SS_Score', color='black')
 plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
 plt.xlim([0,50])                                     # set x-Axis View Range,[from,to]
 plt.scatter(arrX, arrY)
 # Sort both arrays in sync by x, then y (lexsort uses the LAST key as the
 # primary key); the arrays stay numpy arrays, so no re-conversion is needed
 reihenfolge = np.lexsort((arrY, arrX))
 arrX = arrX[reihenfolge]
 arrY = arrY[reihenfolge]
 # Use least Square Linear Regression from SciPy Stats
 regr_results = stats.linregress(arrX, arrY)
 steigung = regr_results.slope
 yAchsAbschn = regr_results.intercept
 arrYpredicted = steigung * arrX + yAchsAbschn        # using y = m*x + n, calculate every single Y-Value fitting the regression Lines X-Values
 print("y =", steigung, "* x +", yAchsAbschn)
 # Plot Linear Regression Line
 plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
 plt.show()
+x,freq
+,159
+,500
+,674
+,110
+,21