Lineare Regression
This commit is contained in:
@@ -46,3 +46,7 @@
|
|||||||
- Listen und Arrays
|
- Listen und Arrays
|
||||||
- ```Uebung1.py```
|
- ```Uebung1.py```
|
||||||
- ToDo: ```Uebung2.py```
|
- ToDo: ```Uebung2.py```
|
||||||
|
|
||||||
|
# Vorlesung 5
|
||||||
|
11.11.2021
|
||||||
|
- ```Vorlesung V.pdf```
|
||||||
@@ -1,7 +1,12 @@
|
|||||||
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
df = pd.read_csv('/home/pi/Documents/Code/Python/ProgrammierungUndDatenanalyse/Sonstiges/STAT2/vl2-varianz-v1.csv')
|
# location will help to open files in the same directory as the py-script
|
||||||
|
__location__ = os.path.realpath(
|
||||||
|
os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|
||||||
|
df = pd.read_csv(os.path.join(__location__, 'vl2-varianz-v1.csv'))
|
||||||
|
|
||||||
# Dataframe
|
# Dataframe
|
||||||
print(df)
|
print(df)
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
df = pd.read_csv('/home/pi/Documents/Code/Python/ProgrammierungUndDatenanalyse/Sonstiges/STAT2/vl3-standardfehler.csv')
|
# location will help to open files in the same directory as the py-script
|
||||||
|
__location__ = os.path.realpath(
|
||||||
|
os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|
||||||
|
df = pd.read_csv(os.path.join(__location__, 'vl3-standardfehler.csv'))
|
||||||
|
|
||||||
# Dataframe
|
# Dataframe
|
||||||
print(df)
|
print(df)
|
||||||
@@ -27,14 +32,28 @@ for index, row in df.iterrows():
|
|||||||
summeQuadrierteAbweichungen = summeQuadrierteAbweichungen + (row.freq * (row.x - mean)**2)
|
summeQuadrierteAbweichungen = summeQuadrierteAbweichungen + (row.freq * (row.x - mean)**2)
|
||||||
print(row['x'], row['freq'], 'summe²abweichungen: ', summeQuadrierteAbweichungen)
|
print(row['x'], row['freq'], 'summe²abweichungen: ', summeQuadrierteAbweichungen)
|
||||||
variance = variancePart1 * summeQuadrierteAbweichungen
|
variance = variancePart1 * summeQuadrierteAbweichungen
|
||||||
print("variance: ", variance)
|
print("pop variance: ", variance)
|
||||||
|
|
||||||
|
# √(^σ²)
|
||||||
standardDev = variance**(1/2) # √n = n^1/2
|
standardDev = variance**(1/2) # √n = n^1/2
|
||||||
print("Standardabweichung: ", standardDev)
|
print("pop Standardabweichung: ", standardDev)
|
||||||
|
|
||||||
|
# ŝd / √(Σfreq)
|
||||||
standardfehler = standardDev / sums.freq**(1/2) # √n = n^1/2
|
standardfehler = standardDev / sums.freq**(1/2) # √n = n^1/2
|
||||||
print("Standardfehler des Mittelwerts: ", standardfehler)
|
print("Standardfehler des Mittelwerts: ", standardfehler)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# "Bonus":
|
||||||
|
|
||||||
# Mittelwertsverteilung bei 2 Würfeln
|
# Mittelwertsverteilung bei 2 Würfeln
|
||||||
print()
|
print()
|
||||||
import random
|
import random
|
||||||
|
|||||||
6
Sonstiges/STAT2/vl4-zufriedenheit.csv
Normal file
6
Sonstiges/STAT2/vl4-zufriedenheit.csv
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
x,freq
|
||||||
|
1,159
|
||||||
|
2,500
|
||||||
|
3,674
|
||||||
|
4,110
|
||||||
|
5,21
|
||||||
|
57
Sonstiges/STAT2/vl4-zufriedenheit.py
Normal file
57
Sonstiges/STAT2/vl4-zufriedenheit.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
df = pd.read_csv('/home/pi/Documents/Code/Python/ProgrammierungUndDatenanalyse/Sonstiges/STAT2/vl4-zufriedenheit.csv')
|
||||||
|
|
||||||
|
# Dataframe
|
||||||
|
print(df)
|
||||||
|
|
||||||
|
print(df.sum())
|
||||||
|
|
||||||
|
sums = df.sum()
|
||||||
|
print('Summierte Häufigkeit: ', sums['freq'])
|
||||||
|
|
||||||
|
|
||||||
|
# Calculate Mean, respecting frequencies
|
||||||
|
# mean = Σ(freq * x) / Σ(freq)
|
||||||
|
rowSum = 0
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
rowSum = rowSum + row.x * row.freq
|
||||||
|
mean = rowSum / sums.freq
|
||||||
|
print("mean: ", mean)
|
||||||
|
|
||||||
|
# Geschätzte Populationsvarianz, unter Beachtung der Häufigkeiten
|
||||||
|
# Sample Variance: ^σ² = (1 / (Σfreq - 1)) * Σ(freq*(x - mean)²)
|
||||||
|
variancePart1 = (1 / (sums.freq - 1))
|
||||||
|
summeQuadrierteAbweichungen = 0
|
||||||
|
for index, row in df.iterrows():
|
||||||
|
summeQuadrierteAbweichungen = summeQuadrierteAbweichungen + (row.freq * (row.x - mean)**2)
|
||||||
|
print(row['x'], row['freq'], 'summe²abweichungen: ', summeQuadrierteAbweichungen)
|
||||||
|
variance = variancePart1 * summeQuadrierteAbweichungen
|
||||||
|
print("pop variance: ", variance)
|
||||||
|
|
||||||
|
# √(^σ²)
|
||||||
|
standardDev = variance**(1/2) # √(^σ²) = ^σ²^1/2
|
||||||
|
print("pop Standardabweichung: ", standardDev)
|
||||||
|
|
||||||
|
# Standard error of the mean: √(^σ² / Σfreq)
|
||||||
|
standardfehler = (variance / sums.freq)**(1/2) # √(ŝd / freq)
|
||||||
|
print("Standardfehler des Mittelwerts: ", standardfehler)
|
||||||
|
|
||||||
|
# konf95,5 = mean -+ 2 * standardfehler
|
||||||
|
konf955unten = mean - 2 * standardfehler
|
||||||
|
konf955oben = mean + 2 * standardfehler
|
||||||
|
print("95,5% Konfidenzintervall ", konf955unten, konf955oben)
|
||||||
|
|
||||||
|
# konf95 = mean -+ 1,96 * standardfehler
|
||||||
|
konf95unten = mean - 1.96 * standardfehler
|
||||||
|
konf95oben = mean + 1.96 * standardfehler
|
||||||
|
print("95% Konfidenzintervall ", konf95unten, konf95oben)
|
||||||
|
|
||||||
|
# z-Wert = (xi - mean) / standardDev
|
||||||
|
# z-Wert von 1,00 Ausgezeichnet
|
||||||
|
zwert1 = (1 - mean) / standardDev
|
||||||
|
print("zwert1", zwert1)
|
||||||
|
# z-Wert von 5,00 Schlecht
|
||||||
|
zwert5 = (5 - mean) / standardDev
|
||||||
|
print("zwert5", zwert5)
|
||||||
BIN
Sonstiges/TENTW/Daten_Umfrage_SPSS_20211113.xlsx
Executable file
BIN
Sonstiges/TENTW/Daten_Umfrage_SPSS_20211113.xlsx
Executable file
Binary file not shown.
79
Sonstiges/TENTW/linearregression.py
Normal file
79
Sonstiges/TENTW/linearregression.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
|
||||||
|
# or https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
|
||||||
|
# pip3 install openpyxl
|
||||||
|
from numpy.matrixlib import defmatrix
|
||||||
|
import os
|
||||||
|
import pandas as pd # To read data
|
||||||
|
import math as m
|
||||||
|
import numpy as np
|
||||||
|
import scipy as sp
|
||||||
|
from scipy import stats
|
||||||
|
import matplotlib.pyplot as plt # To visualize
|
||||||
|
|
||||||
|
# location will help to open files in the same directory as the py-script
|
||||||
|
__location__ = os.path.realpath(
|
||||||
|
os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|
||||||
|
df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
|
||||||
|
|
||||||
|
df = df.apply(pd.to_numeric, errors='coerce') # convert non-numeric values to NaN, e.g. Header "row 1" "CodeXYZ" -> "row 1" "NaN"
|
||||||
|
print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
|
||||||
|
print(df.head(10))
|
||||||
|
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
|
||||||
|
# 0 NaN NaN NaN NaN NaN NaN
|
||||||
|
# 1 NaN 6.0 4.0 7.0 4.0 5.0
|
||||||
|
# ...
|
||||||
|
#df = df.dropna() # CAUTION: drops every row that contains even a single NaN!
|
||||||
|
|
||||||
|
print(df.tail(10))
|
||||||
|
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
|
||||||
|
# 155 NaN 4.0 4.0 3.0 5.0 1.0
|
||||||
|
# 156 NaN NaN NaN NaN NaN NaN
|
||||||
|
# (End of File)
|
||||||
|
|
||||||
|
#print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
|
||||||
|
#for col in df.columns:
|
||||||
|
#print(col)
|
||||||
|
|
||||||
|
# Calculate Mean, gew, inv
|
||||||
|
mwHO01_Diff = df["HO01_Diff"][1:156].mean(skipna=True) # skipna to ignore NaN rows
|
||||||
|
mwHO01_Diff = round(mwHO01_Diff, 2)
|
||||||
|
gewHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2)
|
||||||
|
invHO01_Diff = 1 - gewHO01_Diff
|
||||||
|
# usw
|
||||||
|
print("HO01_Diff Mittelwert:", mwHO01_Diff)
|
||||||
|
print("HO01_Diff Gewichtet:", gewHO01_Diff)
|
||||||
|
print("HO01_Diff Invertiert:", invHO01_Diff)
|
||||||
|
# usw
|
||||||
|
|
||||||
|
# Limit Dataframe Column and row Amount
|
||||||
|
dfColumnX = df["SS_Score"][1:156]
|
||||||
|
dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
|
||||||
|
|
||||||
|
# Convert Dataframe Columns to Array containing the X- and Y- Values
|
||||||
|
arrX = np.asarray(dfColumnX) # convert that dataframe column to numpy array
|
||||||
|
arrY = np.asarray(dfColumnY)
|
||||||
|
|
||||||
|
# Prepare Plot Image
|
||||||
|
plt.xlabel('SS_Score', color='black')
|
||||||
|
plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
|
||||||
|
plt.xlim([0,50]) # set x-Axis View Range,[from,to]
|
||||||
|
plt.scatter(arrX, arrY)
|
||||||
|
|
||||||
|
arrX, arrY = zip(*sorted(zip(arrX,arrY))) # sort 2 arrays in sync
|
||||||
|
|
||||||
|
# Convert again, as sorting seemed to break the numpy array data format
|
||||||
|
arrX = np.asarray(arrX) # before: "1 16.0" after: "[16. 18. 21. ...]"
|
||||||
|
arrY = np.asarray(arrY)
|
||||||
|
|
||||||
|
# Use least Square Linear Regression from SciPy Stats
|
||||||
|
regr_results = sp.stats.linregress(arrX, arrY)
|
||||||
|
steigung = regr_results.slope
|
||||||
|
yAchsAbschn = regr_results.intercept
|
||||||
|
arrYpredicted = steigung * arrX + yAchsAbschn # using y = m*x + n, calculate every single Y-Value fitting the regression Lines X-Values
|
||||||
|
|
||||||
|
print("y =", steigung, "* x +", yAchsAbschn)
|
||||||
|
|
||||||
|
# Plot Linear Regression Line
|
||||||
|
plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
|
||||||
|
plt.show()
|
||||||
Reference in New Issue
Block a user