meccatronis committed on
Commit
6c9004a
·
verified ·
1 Parent(s): b100231

Upload benchmark_fp16_fixed.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_fp16_fixed.py +319 -0
benchmark_fp16_fixed.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import torch
3
+ import matplotlib.pyplot as plt
4
+ import matplotlib.animation as animation
5
+ from datetime import datetime
6
+ import subprocess
7
+ import time
8
+ import psutil
9
+ import re
10
+ from collections import deque
11
+ import threading
12
+ import signal
13
+ import sys
14
+
15
class FP16BenchmarkFixed:
    """Adaptive FP16 matrix-multiply stress benchmark for an AMD Radeon Pro VII.

    Runs chained ``torch.mm`` workloads in half precision across multiple CUDA/ROCm
    streams, polls temperature (``sensors``) and power (``rocm-smi``) via external
    tools, throttles the workload to stay below ``max_temp``, and live-plots
    TFLOPS / temperature / power / estimated current with matplotlib.
    """

    def __init__(self):
        # Hard thermal cutoff in degrees C; the stress loop aborts at/above this.
        self.max_temp = 85
        # Rolling windows (last 200 samples) feeding the four live plots.
        self.temperatures = deque(maxlen=200)
        self.tflops_history = deque(maxlen=200)
        self.load_level = deque(maxlen=200)
        self.power_watts = deque(maxlen=200)
        self.voltage_volts = deque(maxlen=200)
        self.current_amps = deque(maxlen=200)
        self.peak_tflops = 0
        self.peak_power = 0
        self.running = True

        # Adaptive workload parameters; increase_load/decrease_load scale these.
        self.current_load = 1       # abstract load level, 1..10
        self.matrix_size = 10240    # square matmul dimension
        self.num_operations = 1     # chained matmuls per stream
        self.num_streams = 1        # concurrent streams (capped at 4)

        # Interactive 2x2 dashboard: TFLOPS, temperature, power, current.
        plt.ion()
        self.fig, ((self.ax1, self.ax2), (self.ax3, self.ax4)) = plt.subplots(2, 2, figsize=(16, 10))
        self.fig.suptitle('BENCHMARK FP16 COMPLETO - Radeon Pro VII', fontsize=16, weight='bold')

        self.last_temp_check = time.time()
        self.temp_rising_fast = False

        # Allow Ctrl+C to stop the stress thread cleanly.
        signal.signal(signal.SIGINT, self.signal_handler)

    def signal_handler(self, sig, frame):
        """SIGINT handler: stop the stress loop and exit."""
        print("\n\n🛑 Interrompido pelo usuário")
        self.running = False
        sys.exit(0)

    def get_gpu_metrics(self):
        """Return ``(temp_C, power_W, voltage_V)`` read via ``sensors`` and ``rocm-smi``.

        Both tools are best-effort: on any failure (missing binary, timeout,
        unparsable output) the corresponding reading stays 0. Voltage is a fixed
        typical value, not a measurement.
        """
        temp = 0
        power = 0

        # Edge temperature via lm-sensors ("edge: +NN.N°C" line).
        try:
            result = subprocess.run(['sensors'], capture_output=True, text=True, timeout=0.5)
            for line in result.stdout.split('\n'):
                if 'edge:' in line.lower():
                    match = re.search(r'([+-]?\d+\.?\d*)\s*°C', line)
                    if match:
                        temp = float(match.group(1))
        except Exception:
            # Best-effort read; keep temp = 0. Narrowed from a bare except so
            # KeyboardInterrupt/SystemExit still propagate.
            pass

        # Package power via rocm-smi.
        try:
            result = subprocess.run(['rocm-smi', '--showpower'],
                                    capture_output=True, text=True, timeout=0.5)
            for line in result.stdout.split('\n'):
                # Matches: "Current Socket Graphics Package Power (W): 19.0"
                if 'Power (W)' in line or 'Power: ' in line:
                    match = re.search(r':\s*(\d+\.?\d*)', line)
                    if match:
                        power = float(match.group(1))
        except Exception:
            # Best-effort read; keep power = 0.
            pass

        # Voltage is estimated, not measured: Radeon VII typically runs ~1.0-1.2 V.
        voltage = 1.05  # typical core voltage

        return temp, power, voltage

    def check_system_health(self):
        """Return False when the host looks overloaded (slow sampling or CPU > 95%)."""
        try:
            start = time.time()
            cpu = psutil.cpu_percent(interval=0.05)
            response = time.time() - start
            # A 0.05 s sample taking > 0.4 s means the scheduler is starved.
            if response > 0.4 or cpu > 95:
                return False
            return True
        except Exception:
            return False

    def calculate_tflops(self, matrix_size, elapsed_time, num_ops, num_streams):
        """Return achieved TFLOPS for chained square matmuls.

        One N x N matmul costs 2*N^3 FLOPs. Returns 0.0 for a non-positive
        elapsed time instead of raising ZeroDivisionError (can happen when the
        timer resolution is coarser than a tiny workload).
        """
        if elapsed_time <= 0:
            return 0.0
        operations = 2 * (matrix_size ** 3) * num_ops * num_streams
        return (operations / elapsed_time) / 1e12

    def increase_load(self):
        """Step the workload up one level, widening streams/ops/matrix as thresholds pass."""
        if self.current_load < 10:
            self.current_load += 1

        if self.current_load >= 2 and self.num_streams < 4:
            self.num_streams += 1

        if self.current_load >= 4 and self.num_operations < 30:
            self.num_operations += 5

        if self.current_load >= 6 and self.matrix_size < 16384:
            self.matrix_size = min(self.matrix_size + 1024, 16384)

    def decrease_load(self):
        """Step the workload down one level (thermal/OOM backoff)."""
        if self.current_load > 1:
            self.current_load -= 1

        if self.matrix_size > 8192:
            self.matrix_size = max(self.matrix_size - 512, 8192)

        if self.num_operations > 5:
            self.num_operations = max(self.num_operations - 5, 1)

    def stress_gpu(self):
        """Main stress loop (runs on a worker thread).

        Repeatedly launches FP16 matmul chains on up to 4 streams, records
        metrics, and adapts the load: backs off near the thermal limit or on
        OOM, ramps up while temperature stays below 75 C.
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if device.type == 'cpu':
            print("❌ ERRO: GPU não detectada!")
            self.running = False
            return

        props = torch.cuda.get_device_properties(0)
        print(f"🎯 GPU: {torch.cuda.get_device_name(0)}")
        print(f"💾 VRAM: {props.total_memory / 1e9:.1f} GB")
        print(f"🔥 Modo: FP16 (Half Precision)")
        print(f"📊 TFLOPS Teórico FP16: ~26.88")
        print(f"⚡ TDP: 300W")
        print(f"⚠️ Limite: {self.max_temp}°C\n")

        streams = [torch.cuda.Stream() for _ in range(4)]
        last_temp = 0
        stable_cycles = 0

        while self.running:
            current_time = time.time()
            # Sample sensors at most every 100 ms; between samples reuse the
            # previous readings so the compute loop is not throttled by subprocess calls.
            if current_time - self.last_temp_check > 0.1:
                temp, power, voltage = self.get_gpu_metrics()
                self.last_temp_check = current_time

                # Estimated current from P = V * I (voltage itself is an estimate).
                current = power / voltage if voltage > 0 and power > 0 else 0

                if len(self.temperatures) > 0:
                    temp_delta = temp - last_temp
                    # >2 C rise between samples counts as a fast ramp.
                    if temp_delta > 2:
                        self.temp_rising_fast = True
                    else:
                        self.temp_rising_fast = False

                if temp >= self.max_temp:
                    print(f"\n🚨 TEMPERATURA: {temp}°C - ABORTANDO!")
                    self.running = False
                    break

                # Within 3 C of the limit: back off twice (aggressive throttle).
                if temp >= self.max_temp - 3:
                    self.decrease_load()
                    self.decrease_load()

                if self.temp_rising_fast and temp > 75:
                    self.decrease_load()

                last_temp = temp
            else:
                # No fresh sample: carry forward last temperature/power.
                temp, power, voltage = last_temp, self.power_watts[-1] if self.power_watts else 0, 1.05
                current = power / voltage if voltage > 0 and power > 0 else 0

            if not self.check_system_health():
                print(f"\n🚨 SISTEMA INSTÁVEL!")
                self.running = False
                break

            try:
                torch.cuda.synchronize()
                start = time.time()

                for i in range(self.num_streams):
                    with torch.cuda.stream(streams[i]):
                        a = torch.randn(self.matrix_size, self.matrix_size, device=device, dtype=torch.float16)
                        b = torch.randn(self.matrix_size, self.matrix_size, device=device, dtype=torch.float16)

                        # Chain matmuls so each depends on the previous result,
                        # preventing the scheduler from skipping work.
                        for _ in range(self.num_operations):
                            c = torch.mm(a, b)
                            a = b
                            b = c

                torch.cuda.synchronize()
                elapsed = time.time() - start

                tflops = self.calculate_tflops(self.matrix_size, elapsed,
                                               self.num_operations, self.num_streams)

                self.temperatures.append(temp)
                self.tflops_history.append(tflops)
                self.load_level.append(self.current_load)
                self.power_watts.append(power)
                self.voltage_volts.append(voltage)
                self.current_amps.append(current)

                if tflops > self.peak_tflops:
                    self.peak_tflops = tflops

                if power > self.peak_power:
                    self.peak_power = power

                print(f"TFLOPS: {tflops:6.2f} | Temp: {temp:5.1f}°C | {power:6.1f}W | {voltage:5.3f}V | {current:6.1f}A | Peak: {self.peak_tflops:.2f}", end='\r')

                # Ramp up only after 10 consecutive cool (<80 C) cycles while
                # currently below 75 C; reset the counter when hot.
                if temp < 75 and stable_cycles > 10:
                    self.increase_load()
                    stable_cycles = 0
                elif temp < 80:
                    stable_cycles += 1
                else:
                    stable_cycles = 0

                time.sleep(0.02)

            except RuntimeError as e:
                if "out of memory" in str(e):
                    # OOM backoff: shrink the workload and free cached blocks.
                    self.decrease_load()
                    torch.cuda.empty_cache()
                else:
                    print(f"\n🚨 ERRO: {e}")
                    self.running = False
                    break
            except Exception as e:
                print(f"\n🚨 ERRO: {e}")
                self.running = False
                break

    def update_plot(self, frame):
        """Redraw the four dashboard panels from the rolling metric windows."""
        if len(self.tflops_history) == 0:
            return

        for ax in [self.ax1, self.ax2, self.ax3, self.ax4]:
            ax.clear()

        # Panel 1: achieved TFLOPS vs. peak and theoretical maximum.
        if len(self.tflops_history) > 0:
            self.ax1.plot(list(self.tflops_history), 'b-', linewidth=2.5)
            self.ax1.axhline(y=self.peak_tflops, color='g', linestyle='--', linewidth=2,
                             label=f'Peak: {self.peak_tflops:.2f}')
            self.ax1.axhline(y=26.88, color='orange', linestyle=':', linewidth=2,
                             label='Teórico: 26.88')
            self.ax1.set_ylabel('TFLOPS', fontsize=11, weight='bold')
            self.ax1.set_title('Performance FP16', fontsize=11, weight='bold')
            self.ax1.legend(loc='upper left', fontsize=9)
            self.ax1.grid(True, alpha=0.3)
            self.ax1.set_ylim(0, 30)

        # Panel 2: temperature with the danger band (within 5 C of the limit) shaded.
        if len(self.temperatures) > 0:
            temps = list(self.temperatures)
            self.ax2.plot(temps, 'r-', linewidth=2.5)
            self.ax2.axhline(y=self.max_temp, color='red', linestyle='--', linewidth=2)
            self.ax2.fill_between(range(len(temps)), temps, self.max_temp,
                                  where=[t >= self.max_temp - 5 for t in temps],
                                  alpha=0.3, color='orange')
            self.ax2.set_ylabel('Temperatura (°C)', fontsize=11, weight='bold')
            self.ax2.set_title('Temperatura', fontsize=11, weight='bold')
            self.ax2.grid(True, alpha=0.3)
            self.ax2.set_ylim(30, 95)

        # Panel 3: power draw vs. TDP and observed peak.
        if len(self.power_watts) > 0:
            powers = list(self.power_watts)
            self.ax3.plot(powers, 'green', linewidth=2.5)
            self.ax3.axhline(y=300, color='red', linestyle='--', linewidth=2,
                             label='TDP: 300W')
            self.ax3.axhline(y=self.peak_power, color='orange', linestyle=':', linewidth=2,
                             label=f'Peak: {self.peak_power:.1f}W')
            self.ax3.fill_between(range(len(powers)), powers, alpha=0.3, color='green')
            self.ax3.set_ylabel('Potência (W)', fontsize=11, weight='bold')
            self.ax3.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax3.set_title('Consumo', fontsize=11, weight='bold')
            self.ax3.legend(loc='upper left', fontsize=9)
            self.ax3.grid(True, alpha=0.3)
            self.ax3.set_ylim(0, 350)

        # Panel 4: estimated current (derived from power / assumed voltage).
        if len(self.current_amps) > 0:
            amps = list(self.current_amps)
            self.ax4.plot(amps, 'purple', linewidth=2.5)
            self.ax4.fill_between(range(len(amps)), amps, alpha=0.3, color='purple')
            self.ax4.set_ylabel('Corrente (A)', fontsize=11, weight='bold')
            self.ax4.set_xlabel('Amostras', fontsize=11, weight='bold')
            self.ax4.set_title('Corrente Estimada', fontsize=11, weight='bold')
            self.ax4.grid(True, alpha=0.3)

    def run(self):
        """Start the stress thread, animate the dashboard, then print a summary."""
        stress_thread = threading.Thread(target=self.stress_gpu)
        stress_thread.daemon = True
        stress_thread.start()

        # Drive the plot from the main thread (matplotlib is not thread-safe).
        while self.running and stress_thread.is_alive():
            self.update_plot(None)
            plt.pause(0.3)

        avg_power = sum(self.power_watts) / len(self.power_watts) if self.power_watts else 0
        avg_current = sum(self.current_amps) / len(self.current_amps) if self.current_amps else 0

        print(f"\n\n{'='*70}")
        print(f"{'RESULTADO FINAL':^70}")
        print(f"{'='*70}")
        print(f"🏆 PEAK TFLOPS (FP16): {self.peak_tflops:.2f}")
        print(f"📊 Teórico: 26.88 TFLOPS")
        print(f"📈 Eficiência: {(self.peak_tflops / 26.88) * 100:.1f}%")
        print(f"🌡️ Temp Máx: {max(self.temperatures) if self.temperatures else 0:.1f}°C")
        print(f"⚡ Potência Peak: {self.peak_power:.1f}W")
        print(f"⚡ Potência Média: {avg_power:.1f}W")
        print(f"🔌 Corrente Média: {avg_current:.1f}A")
        print(f"🔥 Carga Máx: {max(self.load_level) if self.load_level else 0}/10")
        print(f"{'='*70}\n")

        plt.ioff()
        plt.show()
316
+
317
if __name__ == "__main__":
    # Entry point: build the benchmark dashboard and run until it stops
    # (thermal abort, system instability, or Ctrl+C).
    benchmark = FP16BenchmarkFixed()
    benchmark.run()