diff --git a/README.md b/README.md
index 15e9374..674b4e2 100644
--- a/README.md
+++ b/README.md
@@ -46,3 +46,7 @@
 - Listen und Arrays
 - ```Uebung1.py```
 - ToDo: ```Uebung2.py```
+
+# Vorlesung 5
+11.11.2021
+- ```Vorlesung V.pdf```
\ No newline at end of file
diff --git a/Sonstiges/STAT2/vl2-varianz-v1.py b/Sonstiges/STAT2/vl2-varianz-v1.py
index 51024cd..60e8f0b 100644
--- a/Sonstiges/STAT2/vl2-varianz-v1.py
+++ b/Sonstiges/STAT2/vl2-varianz-v1.py
@@ -1,7 +1,12 @@
+import os
 import pandas as pd
 import numpy as np
 
-df = pd.read_csv('/home/pi/Documents/Code/Python/ProgrammierungUndDatenanalyse/Sonstiges/STAT2/vl2-varianz-v1.csv')
+# location will help to open files in the same directory as the py-script
+__location__ = os.path.realpath(
+    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+df = pd.read_csv(os.path.join(__location__, 'vl2-varianz-v1.csv'))
 
 # Dataframe
 print(df)
diff --git a/Sonstiges/STAT2/vl3-standardfehler.py b/Sonstiges/STAT2/vl3-standardfehler.py
index 04a9585..fc9a557 100644
--- a/Sonstiges/STAT2/vl3-standardfehler.py
+++ b/Sonstiges/STAT2/vl3-standardfehler.py
@@ -1,7 +1,12 @@
+import os
 import pandas as pd
 import numpy as np
 
-df = pd.read_csv('/home/pi/Documents/Code/Python/ProgrammierungUndDatenanalyse/Sonstiges/STAT2/vl3-standardfehler.csv')
+# location will help to open files in the same directory as the py-script
+__location__ = os.path.realpath(
+    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+df = pd.read_csv(os.path.join(__location__, 'vl3-standardfehler.csv'))
 
 # Dataframe
 print(df)
@@ -27,14 +32,28 @@ for index, row in df.iterrows():
     summeQuadrierteAbweichungen = summeQuadrierteAbweichungen + (row.freq * (row.x - mean)**2)
     print(row['x'], row['freq'], 'summe²abweichungen: ', summeQuadrierteAbweichungen)
 variance = variancePart1 * summeQuadrierteAbweichungen
-print("variance: ", variance)
+print("pop variance: ", variance)
 
+# ^σ = √(^σ²)
 standardDev = variance**(1/2) # √n = n^1/2
-print("Standardabweichung: ", standardDev)
+print("pop Standardabweichung: ", standardDev)
 
+# standard error of the mean: ^σ / √n, with n = Σ(freq)
 standardfehler = standardDev / sums.freq**(1/2) # √n = n^1/2
 print("Standardfehler des Mittelwerts: ", standardfehler)
 
+
+
+
+
+
+
+
+
+
+
+# "Bonus":
+
 # Mittelwertsverteilung bei 2 Würfeln
 print()
 import random
diff --git a/Sonstiges/STAT2/vl4-zufriedenheit.csv b/Sonstiges/STAT2/vl4-zufriedenheit.csv
new file mode 100644
index 0000000..cc16aed
--- /dev/null
+++ b/Sonstiges/STAT2/vl4-zufriedenheit.csv
@@ -0,0 +1,6 @@
+x,freq
+1,159
+2,500
+3,674
+4,110
+5,21
\ No newline at end of file
diff --git a/Sonstiges/STAT2/vl4-zufriedenheit.py b/Sonstiges/STAT2/vl4-zufriedenheit.py
new file mode 100644
index 0000000..675e5ad
--- /dev/null
+++ b/Sonstiges/STAT2/vl4-zufriedenheit.py
@@ -0,0 +1,62 @@
+import os
+import pandas as pd
+import numpy as np
+
+# location will help to open files in the same directory as the py-script
+__location__ = os.path.realpath(
+    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+df = pd.read_csv(os.path.join(__location__, 'vl4-zufriedenheit.csv'))
+
+# Dataframe
+print(df)
+
+print(df.sum())
+
+sums = df.sum()
+print('Summierte Häufigkeit: ', sums['freq'])
+
+
+# Calculate the mean, respecting frequencies
+# mean = Σ(freq * x) / n, with n = Σ(freq)
+rowSum = 0
+for index, row in df.iterrows():
+    rowSum = rowSum + row.x * row.freq
+mean = rowSum / sums.freq
+print("mean: ", mean)
+
+# Estimated population variance (sample variance), respecting frequencies
+# ^σ² = 1 / (n - 1) * Σ(freq * (x - mean)²), with n = Σ(freq)
+variancePart1 = (1 / (sums.freq - 1))
+summeQuadrierteAbweichungen = 0
+for index, row in df.iterrows():
+    summeQuadrierteAbweichungen = summeQuadrierteAbweichungen + (row.freq * (row.x - mean)**2)
+    print(row['x'], row['freq'], 'summe²abweichungen: ', summeQuadrierteAbweichungen)
+variance = variancePart1 * summeQuadrierteAbweichungen
+print("pop variance: ", variance)
+
+# Estimated population standard deviation: ^σ = √(^σ²)
+standardDev = variance**(1/2) # √(^σ²) = (^σ²)^(1/2)
+print("pop Standardabweichung: ", standardDev)
+
+# Standard error of the mean: √(^σ² / n) = ^σ / √n
+standardfehler = (variance / sums.freq)**(1/2) # √(^σ² / n)
+print("Standardfehler des Mittelwerts: ", standardfehler)
+
+# 95.5% confidence interval = mean -+ 2 * standard error
+konf955unten = mean - 2 * standardfehler
+konf955oben = mean + 2 * standardfehler
+print("95,5% Konfidenzintervall ", konf955unten, konf955oben)
+
+# 95% confidence interval = mean -+ 1.96 * standard error
+konf95unten = mean - 1.96 * standardfehler
+konf95oben = mean + 1.96 * standardfehler
+print("95% Konfidenzintervall ", konf95unten, konf95oben)
+
+# z-value = (xi - mean) / standardDev
+# z-value of the rating 1.00 ("Ausgezeichnet" = excellent)
+zwert1 = (1 - mean) / standardDev
+print("zwert1", zwert1)
+# z-value of the rating 5.00 ("Schlecht" = bad)
+zwert5 = (5 - mean) / standardDev
+print("zwert5", zwert5)
\ No newline at end of file
diff --git a/Sonstiges/TENTW/Daten_Umfrage_SPSS_20211113.xlsx b/Sonstiges/TENTW/Daten_Umfrage_SPSS_20211113.xlsx
new file mode 100755
index 0000000..c0ce10b
Binary files /dev/null and b/Sonstiges/TENTW/Daten_Umfrage_SPSS_20211113.xlsx differ
diff --git a/Sonstiges/TENTW/linearregression.py b/Sonstiges/TENTW/linearregression.py
new file mode 100644
index 0000000..3a366ec
--- /dev/null
+++ b/Sonstiges/TENTW/linearregression.py
@@ -0,0 +1,85 @@
+# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
+# or https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
+# pip3 install openpyxl
+from numpy.matrixlib import defmatrix
+import os
+import pandas as pd # To read data
+import math as m
+import numpy as np
+import scipy as sp
+from scipy import stats
+import matplotlib.pyplot as plt # To visualize
+
+# location will help to open files in the same directory as the py-script
+__location__ = os.path.realpath(
+    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
+
+df = df.apply(pd.to_numeric, errors='coerce') # convert non-numeric values to NaN, e.g. Header "row 1" "CodeXYZ" -> "row 1" "NaN"
+print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
+print(df.head(10))
+#    Code  SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
+# 0   NaN      NaN      NaN      NaN      NaN      NaN
+# 1   NaN      6.0      4.0      7.0      4.0      5.0
+# ...
+#df = df.dropna() # CAUTION: drops every row that even contains single NaN !
+
+print(df.tail(10))
+#      Code  SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
+# 155   NaN      4.0      4.0      3.0      5.0      1.0
+# 156   NaN      NaN      NaN      NaN      NaN      NaN
+# (End of File)
+
+#print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
+#for col in df.columns:
+    #print(col)
+
+# Calculate Mean, gew (weighted), inv (inverted)
+mwHO01_Diff = df["HO01_Diff"][1:156].mean(skipna=True) # skipna to ignore NaN rows
+mwHO01_Diff = round(mwHO01_Diff, 2)
+gewHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2)
+invHO01_Diff = 1 - gewHO01_Diff
+# etc.
+print("HO01_Diff Mittelwert:", mwHO01_Diff)
+print("HO01_Diff Gewichtet:", gewHO01_Diff)
+print("HO01_Diff Invertiert:", invHO01_Diff)
+# etc.
+
+# Limit Dataframe Column and row Amount
+dfColumnX = df["SS_Score"][1:156]
+dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
+
+# Convert Dataframe Columns to Array containing the X- and Y- Values
+arrX = np.asarray(dfColumnX, dtype=float) # convert that dataframe column to numpy array
+arrY = np.asarray(dfColumnY, dtype=float)
+
+# Keep only complete (x, y) pairs: a single NaN would make the tuple sort below
+# unreliable and turn the slope/intercept returned by linregress into NaN.
+valid = ~(np.isnan(arrX) | np.isnan(arrY))
+arrX = arrX[valid]
+arrY = arrY[valid]
+
+# Prepare Plot Image
+plt.xlabel('SS_Score', color='black')
+plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
+plt.xlim([0,50]) # set x-Axis View Range,[from,to]
+plt.scatter(arrX, arrY)
+
+arrX, arrY = zip(*sorted(zip(arrX,arrY))) # sort 2 arrays in sync
+
+# Convert again, as sorting seemed to break the numpy array data format
+arrX = np.asarray(arrX) # before: "1 16.0" after: "[16. 18. 21. ...]"
+arrY = np.asarray(arrY)
+
+# Use least Square Linear Regression from SciPy Stats
+regr_results = sp.stats.linregress(arrX, arrY)
+steigung = regr_results.slope
+yAchsAbschn = regr_results.intercept
+arrYpredicted = steigung * arrX + yAchsAbschn # using y = m*x + n, calculate every single Y-Value fitting the regression Lines X-Values
+
+print("y =", steigung, "* x +", yAchsAbschn)
+
+# Plot Linear Regression Line
+plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
+plt.show()
\ No newline at end of file