Code for the term paper, parts a-c and e

dev weycloud
2022-01-22 17:08:15 +01:00
parent b3e4a699e4
commit 14f7b483f9
7 changed files with 416 additions and 2 deletions

View File

@@ -0,0 +1,84 @@
```
# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
# based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
# pip3 install openpyxl
import os
import pandas as pd # To read data
import math as m
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt # To visualize
# location will help to open files in the same directory as the py-script
__location__ = os.path.realpath(
os.path.join(os.getcwd(), os.path.dirname(__file__)))
df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
df = df.apply(pd.to_numeric, errors='coerce') # convert non-numeric values to NaN, e.g. the text cell "CodeXYZ" in row 1 becomes NaN
print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
print(df.head(10))
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
# 0 NaN NaN NaN NaN NaN NaN
# 1 NaN 6.0 4.0 7.0 4.0 5.0
# ...
#df = df.dropna() # CAUTION: drops every row that contains even a single NaN!
print(df.tail(10))
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
# 155 NaN 4.0 4.0 3.0 5.0 1.0
# 156 NaN NaN NaN NaN NaN NaN
# (End of File)
#print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
#for col in df.columns:
#print(col)
# Calculate mean, normalized and inverted value
mwHO01_Diff = df["HO01_Diff"][1:156] # limit to the relevant rows of this column
mwHO01_Diff = mwHO01_Diff.mean(skipna=True) # arithmetic mean of the column, skipna ignores NaN cells
mwHO01_Diff = round(mwHO01_Diff, 2)
normHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2) # normalize: sqrt of the square, i.e. the absolute value of mwHO01_Diff / 6
invHO01_Diff = 1 - normHO01_Diff # invert
# etc. for the remaining columns
print("HO01_Diff Mittelwert:", mwHO01_Diff)
print("HO01_Diff Normiert:", normHO01_Diff)
print("HO01_Diff Invertiert:", invHO01_Diff)
# etc. for the remaining columns
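# Sketch (not part of the original script): the mean / normalize / invert step above
# could be repeated for further columns with a small loop; the column list here is
# only an illustrative placeholder.
#for spalte in ["HO01_Diff"]:
#    mw = round(df[spalte][1:156].mean(skipna=True), 2)
#    norm = m.sqrt((mw / 6)**2)
#    print(spalte, "Mittelwert:", mw, "Normiert:", norm, "Invertiert:", 1 - norm)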
# Select the DataFrame columns and the row range
dfColumnX = df["SS_Score"][1:156]
dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
# Convert Dataframe Columns to Array containing the X- and Y- Values
arrX = np.asarray(dfColumnX) # convert that dataframe column to numpy array
arrY = np.asarray(dfColumnY)
# Prepare Plot Image
plt.xlabel('SS_Score', color='black')
plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
plt.xlim([0,50]) # set x-Axis View Range,[from,to]
plt.scatter(arrX, arrY)
arrX, arrY = zip(*sorted(zip(arrX,arrY))) # sort 2 arrays in sync
# Convert back to numpy arrays, since sorting via zip() yields tuples
arrX = np.asarray(arrX) # before: "1 16.0" after: "[16. 18. 21. ...]"
arrY = np.asarray(arrY)
# Use least Square Linear Regression from SciPy Stats
regr_results = sp.stats.linregress(arrX, arrY)
steigung = regr_results.slope
yAchsAbschn = regr_results.intercept
arrYpredicted = steigung * arrX + yAchsAbschn # using y = m*x + n, compute the predicted y value for every x value
print("y =", steigung, "* x +", yAchsAbschn)
# Plot Linear Regression Line
plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
plt.show()
```

View File

@@ -0,0 +1,9 @@
,battery_power,int_memory,ram
count,200.0,200.0,200.0
mean,1264.56,33.485,2153.125
std,441.5502229610862,17.79559470640905,1140.4263724281582
min,504.0,2.0,263.0
20%,857.6,16.0,870.8000000000001
50%,1249.5,33.0,2172.5
80%,1721.4000000000003,51.0,3317.600000000001
max,1999.0,64.0,3976.0

View File

@@ -0,0 +1,211 @@
id,battery_power,bluetooth,dual_sim,4G,int_memory,ram
1,1043,1,1,0,5,3476
2,841,1,1,1,61,3895
3,1807,1,0,0,27,2396
4,1546,0,1,1,25,3893
5,1434,0,0,1,49,1773
6,1464,1,1,1,50,3506
7,1718,0,0,0,47,3873
8,833,0,1,0,62,1495
9,1111,1,1,1,25,3485
10,1520,0,0,0,25,651
11,1500,0,0,0,55,3866
12,1343,0,0,1,34,3911
13,900,1,1,0,30,439
14,1190,1,1,0,19,992
15,630,0,0,1,51,2751
16,1846,1,0,1,53,563
17,1985,0,1,1,26,2083
18,1042,0,0,1,48,2187
19,1231,1,1,1,37,3902
20,1488,0,0,0,37,2524
21,968,0,0,1,7,1357
22,529,0,1,0,60,3456
23,1558,0,1,0,50,1641
24,533,1,1,0,58,2322
25,1037,0,1,0,5,3862
26,1025,0,1,1,43,557
27,1858,0,1,0,17,2427
28,980,0,0,0,8,625
29,644,1,1,0,15,3836
30,1024,1,1,1,38,854
31,1981,0,1,1,11,2383
32,1380,0,1,1,56,926
33,1557,1,1,0,32,3048
34,1201,1,1,0,49,824
35,1074,1,0,0,45,874
36,1175,1,0,0,33,1617
37,1280,0,1,1,8,3309
38,1715,0,1,1,28,1786
39,1165,0,0,0,40,3128
40,567,0,0,0,2,2243
41,1952,1,0,0,47,994
42,822,1,1,0,43,2749
43,685,1,1,0,12,793
44,1388,1,1,1,17,324
45,1972,0,0,0,48,366
46,1411,1,0,1,57,1228
47,1094,0,1,0,20,1413
48,1653,0,1,1,13,2617
49,916,0,1,0,33,1798
50,1712,0,0,0,29,2693
51,882,1,1,0,32,2437
52,632,0,0,1,33,837
53,1442,1,1,0,56,2815
54,900,0,0,0,47,1440
55,1630,1,1,0,20,3470
56,1596,1,1,0,24,1251
57,1272,1,0,1,54,3181
58,1640,0,1,0,7,1361
59,1111,0,1,0,15,1630
60,1889,1,0,0,25,3689
61,1907,1,1,0,22,3586
62,529,0,1,0,56,3942
63,578,0,0,1,38,1431
64,1634,1,1,1,39,2167
65,1533,1,1,1,22,1248
66,660,0,0,1,47,1986
67,1847,1,0,1,28,1378
68,1206,1,1,0,10,2959
69,549,1,0,1,53,1893
70,1705,1,0,1,23,1676
71,1366,0,1,0,36,1912
72,1991,0,0,0,58,1947
73,1102,1,0,1,40,2734
74,1452,0,0,1,32,3617
75,1810,1,1,0,63,3414
76,1166,1,1,1,41,482
77,881,1,1,0,6,2813
78,1134,1,1,1,64,661
79,1031,1,1,1,20,2546
80,1376,1,1,1,34,3922
81,1391,0,1,1,56,1199
82,979,0,0,0,24,3779
83,1075,1,0,0,36,3870
84,968,0,1,1,61,858
85,1999,0,1,1,15,3840
86,1626,0,0,0,20,454
87,942,0,1,0,28,3953
88,1182,0,0,1,42,1633
89,1982,1,1,0,48,3035
90,1373,0,1,0,27,966
91,1151,0,1,0,44,1761
92,1650,1,1,0,51,1938
93,1663,1,1,0,17,2820
94,1965,1,0,1,3,305
95,679,0,1,0,41,2838
96,1465,0,0,0,3,2042
97,1809,1,1,0,27,700
98,757,1,1,1,26,2593
99,1034,1,0,1,47,1835
100,1119,1,0,0,23,3121
101,559,1,1,1,24,2023
102,1204,1,0,0,19,737
103,1008,0,1,1,15,450
104,1397,1,1,0,19,2928
105,697,0,1,1,34,1694
106,1939,1,0,0,58,2593
107,1039,0,1,0,15,745
108,1605,1,0,1,51,1310
109,1094,0,1,0,34,2743
110,769,1,1,1,30,3976
111,861,1,1,0,40,3931
112,504,0,1,0,63,3455
113,1930,1,1,1,64,1533
114,1795,1,1,0,52,3876
115,1363,0,1,1,2,3239
116,1376,1,0,1,38,3628
117,1981,1,0,0,19,3663
118,1901,0,0,0,62,1786
119,1319,0,1,0,5,3502
120,859,1,0,0,58,815
121,1664,1,0,0,25,275
122,955,0,1,0,62,3758
123,517,0,1,0,38,2747
124,1806,0,0,1,64,1809
125,1348,0,0,1,50,2086
126,1455,0,0,0,16,444
127,1611,0,1,NaN,25,3467
128,1573,1,1,1,24,2776
129,557,0,0,1,23,3170
130,1599,1,0,0,31,1234
131,1051,1,1,0,16,305
132,1857,1,1,0,14,1745
133,1986,0,0,0,24,1707
134,591,1,0,0,21,2220
135,1140,0,1,1,56,3130
136,923,1,0,1,10,1500
137,1582,NaN,0,0,33,2145
138,723,0,0,0,63,488
139,1251,1,1,1,54,3863
140,574,1,1,1,60,2913
141,948,1,1,1,60,2094
142,1571,0,1,0,44,3141
143,564,1,0,0,33,2573
144,1466,0,1,1,57,
145,597,1,1,0,16,3788
146,895,1,1,1,9,3445
147,1535,0,1,1,37,3241
148,1832,1,0,0,44,2976
149,1045,1,0,1,58,2241
150,1483,1,0,1,61,3843
151,976,0,0,1, ,3261
152,1840,1,1,0,46,3264
153,1807,0,1,0,8,826
154,624,1,1,0,30,1314
155,1963,0,0,1,38,2699
156,1307,1,0,1,4,2565
157,1933,1,1, ,23,3973
158,1496,1,1,0,42,3537
159,1532,1,0,0,7,3449
160,1004,0,1,1,8,3895
161,1483,1,1,1,38,2777
162,945,0,0,0,47,2904
163,1081,NaN,0,1,17,3774
164,1012,0,1,1,32,3034
165,1762,0,0,0,50,2940
166,796,1,1,1,36,
167,1547,0,0,1,50,1168
168,988,0,0,1,12,376
169,1180,1,0,1,43,3510
170,852,1,0,1,3,542
171,607,1,0,1,18,550
172,1765,0,0,0,24,791
173,1250,0,0,0,63,1895
174,1577,0,1,0,55,609
175,1153,1,1,0,14,263
176,651,1,0,0, ,2141
177,1186,1,1,0,25,1270
178,1429,1,0,0,33,403
179,556,0,0,1,10,2040
180,1735,1,0,1,21,2597
181,1859,0,0,1,42,714
182,915,0,0,0,10,423
183,890,NaN,0,0,9,1238
184,758,0,0,0,48,926
185,541,0,0,1,28,2704
186,586,0,1,1,33,2817
187,762,1,1,1,39,2271
188,683,0,1,1,9,1513
189,1526,0,1,0,39,339
190,1771,1,0,0,39,794
191,639,0,0,0,28,391
192,1783,1,0,1,23,541
193,1933,0,0,1,55,648
194,1384,,1,1,62,3386
195,1770,0,0,0,26,1651
196,1202,0,0,0,49,1177
197,885,0,1,0,3,2887
198,1629,1,0,1,2,2178
199,1072,1,1,1,4,2878
200,1863,1,1,0,64,3201
201,1739,0,1,0,51,1490
202,895,1,0,0,23,2724
203,1278,1,1,0,56,3032
204,562,1,1,1,43,3352
205,1249,1,1,0,38,3195
206,1811,0,0,1,25,1677
207,560,1,1,1,12,2620
208,1773,1,0,1,61,1061
209,1715,1,0,0,11,1018
210,725,0,1,1,26,1370

View File

@@ -0,0 +1,107 @@
```
import numpy as np # pip3 install numpy
import pandas as pd # pip3 install pandas
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
### a) Reading the source data
# Use the pandas method read_csv(), leaving out the column named "id"
gewuenschteSpalten = ["battery_power", "bluetooth", "dual_sim", "4G", "int_memory", "ram"]
df = pd.read_csv('mobile_device_data.csv', usecols=gewuenschteSpalten)
# Call print() together with the DataFrame method head()
# Parameter n: output the first 12 rows (rows 0 to 11 = 12 rows)
print(df.head(n=12))
# Output:
# battery_power bluetooth dual_sim 4G int_memory ram
# 0 1043 1.0 1 0 5 3476
# ... ... ... ... ... ... ...
# 11 1343 0.0 0 1 34 3911
### b) Cleaning
# The DataFrame contains several kinds of cells that cannot be evaluated:
# 1. cells that already contain NaN (Not a Number)
# 2. cells that contain a blank string (' ')
# -> convert (' ') to NaN via the DataFrame method replace()
df.replace(to_replace=' ', value=np.nan, inplace=True)
# Cases 1 and 2 can now be counted in the same way; 4 rows are found
# in which the bluetooth field is NaN, plus a total of 6 rows in which
# the 4G, int_memory or ram field is NaN:
# print(df.isna().sum())
# Output:
# battery_power 0
# bluetooth 4
# dual_sim 0
# 4G 2
# int_memory 2
# ram 2
# Call the DataFrame method dropna() to delete such entries:
# Parameter axis=0 : the row is dropped
# Parameter how='any' : a single empty cell is enough to drop the row
# Parameter inplace=True : the operation is applied directly to the DataFrame
df.dropna(axis=0, how='any', inplace=True)
# After this change only 200 rows remain in the DataFrame
print(len(df.index))
# Output:
# 200
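# Alternative (sketch, not used here): just like the survey script in this commit,
# all non-numeric cells could also be coerced to NaN in a single step:
#df = df.apply(pd.to_numeric, errors='coerce')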
### c) Univariate analysis
# The data types of the selected features are not all interpreted as integers
# from the start (cf. print(df.info()) ), so convert them first.
# Without this step, .describe() cannot handle the mixed column types properly;
# the sample output would then read: "mean 1264.560000 NaN NaN"
dfMetrischeMerkmale = df[["battery_power", "int_memory", "ram"]].astype(int)
dfUnivariateAnalyse = dfMetrischeMerkmale[["battery_power", "int_memory", "ram"]].describe(include='all', percentiles=[0.2, 0.5, 0.8])
dfUnivariateAnalyse.to_csv('UnivariateAnalyse.csv')
# Output:
# battery_power int_memory ram
# count 200.000000 200.000000 200.000000
# mean 1264.560000 33.485000 2153.125000
# std 441.550223 17.795595 1140.426372
# min 504.000000 2.000000 263.000000
# 20% 857.600000 16.000000 870.800000
# 50% 1249.500000 33.000000 2172.500000
# 80% 1721.400000 51.000000 3317.600000
# max 1999.000000 64.000000 3976.000000
### d) Bar charts
dfNominaleMerkmale = df[["bluetooth", "dual_sim", "4G"]].astype(int)
#dfNominaleSummen = dfNominaleMerkmale["bluetooth", "dual_sim", "4G"].sum()
#print(dfNominaleMerkmale)
#dfNominaleMerkmale.plot(kind="hist")
#plt.show()
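# Sketch for d) (not part of this commit, which covers tasks a-c and e):
# one possible way to draw a bar chart of the value counts for each nominal feature.
# Everything below is an assumption about how d) could look, not the submitted solution.
#fig, achsen = plt.subplots(1, 3)
#for achse, spalte in zip(achsen, ["bluetooth", "dual_sim", "4G"]):
#    dfNominaleMerkmale[spalte].value_counts().sort_index().plot(kind="bar", ax=achse, title=spalte)
#plt.show()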
### e) Pearson correlation and linear regression of two features
print(dfMetrischeMerkmale.corr(method="pearson"))
# Ausgabe:
# battery_power int_memory ram
# battery_power 1.000000 0.050449 -0.069141
# int_memory 0.050449 1.000000 0.047475
# ram -0.069141 0.047475 1.000000
# -> ram and battery_power show the strongest correlation, albeit a negative one:
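# (Sketch, not in the original script:) the strongest absolute off-diagonal correlation
# could also be located programmatically rather than by eye:
#korrAbs = dfMetrischeMerkmale.corr(method="pearson").abs()
#np.fill_diagonal(korrAbs.values, 0)
#print(korrAbs.stack().idxmax()) # -> e.g. ('battery_power', 'ram')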
# Use the SciPy stats module: least-squares method for the linear regression
werteListeX = dfMetrischeMerkmale["ram"]
werteListeY = dfMetrischeMerkmale["battery_power"]
regrErgebnisse = sp.stats.linregress(werteListeX, werteListeY)
steigung = round(regrErgebnisse.slope, 4)
yAchsAbschn = round(regrErgebnisse.intercept, 4)
arrYpredicted = steigung * werteListeX + yAchsAbschn # using y = m*x + n, compute the predicted y value for every x value
print("Regressionsgleichung:", "y =", steigung, "* x +", yAchsAbschn)
# Plot Linear Regression Line
plt.plot(werteListeX, arrYpredicted, label='Lin Regression', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
# Show Plot Image
plt.xlabel('ram', color='black')
plt.ylabel('battery_power', color='black')
#plt.xlim([0,50]) # set x-Axis View Range,[from,to]
plt.scatter(werteListeX, werteListeY)
plt.show()
```

View File

@@ -91,18 +91,21 @@ df["HO_Score_erklaert"] = yAchsAbschn + df["SS_Score"] * steigung
 # Vorlesung 6
 df["Diff_roh_erklaert"] = df["HO_Score_Bewerbung_Roh"] - df["HO_Score_erklaert"]
+print("Diff_roh_erklaert_Mean", df["Diff_roh_erklaert"].mean())
 df.loc[df.Diff_roh_erklaert < 0, "Diff_roh_erklaert"] = 0 # Differenz soll minimal 0 sein dürfen
 df["H0_Final"] = df["HO_Score_Bewerbung_Gewichtet"] + df["Diff_roh_erklaert"]
 df["H0_Wert_Z_Wert"] = (df["H0_Final"] - df["H0_Final"].mean()) / df["H0_Final"].std()
+print("HO_Final mean", df["H0_Final"].mean(), "HO_Final std:", df["H0_Final"].std())
+print("SS_Mean", df["SS_Score"].mean())
 df["H0_Wert_7er_Skala"] = (df["H0_Wert_Z_Wert"] * 1.5) + 4
 df["H0_Wert_7er_Skala"] = round(df["H0_Wert_7er_Skala"], 2)
 df.loc[df.H0_Wert_7er_Skala < 1, "H0_Wert_7er_Skala"] = 1 # ausreisser festlegen auf Min 1
 df.loc[df.H0_Wert_7er_Skala > 7, "H0_Wert_7er_Skala"] = 7 # ausreisser festlegen auf Max 7
-#df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")
+df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")
 # Open Dataframe in Webbrowser:
 def showDf(df):
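The hunk above shows the step of the survey script that z-standardizes the final H0 score and maps it onto a 7-point scale centred on 4 with a spread of 1.5, clamping outliers to the range 1-7. A minimal standalone sketch of that mapping (the z-values are illustrative and not taken from the survey data):

```
import numpy as np

z_werte = np.array([-2.5, -1.0, 0.0, 1.0, 2.5])  # illustrative z-scores
skala = np.clip(z_werte * 1.5 + 4, 1, 7)         # same transform and clamping as in the script
print(skala)  # -> [1.  2.5 4.  5.5 7. ]
```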