From a00cd8b9c1e9866a58d135f3b64cc7e0f29c6d47 Mon Sep 17 00:00:00 2001
From: AUTOMATIC <16777216c@gmail.com>
Date: Sun, 12 Mar 2023 21:04:17 +0300
Subject: [PATCH] attempt to fix memory monitor with multiple CUDA devices

---
 modules/memmon.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/modules/memmon.py b/modules/memmon.py
index a7060f58..4018edcc 100644
--- a/modules/memmon.py
+++ b/modules/memmon.py
@@ -23,12 +23,16 @@ class MemUsageMonitor(threading.Thread):
         self.data = defaultdict(int)
 
         try:
-            torch.cuda.mem_get_info()
+            self.cuda_mem_get_info()
             torch.cuda.memory_stats(self.device)
         except Exception as e: # AMD or whatever
             print(f"Warning: caught exception '{e}', memory monitor disabled")
             self.disabled = True
 
+    def cuda_mem_get_info(self):
+        index = self.device.index if self.device.index is not None else torch.cuda.current_device()
+        return torch.cuda.mem_get_info(index)
+
     def run(self):
         if self.disabled:
             return
@@ -43,10 +47,10 @@ class MemUsageMonitor(threading.Thread):
                 self.run_flag.clear()
                 continue
 
-            self.data["min_free"] = torch.cuda.mem_get_info()[0]
+            self.data["min_free"] = self.cuda_mem_get_info()[0]
 
             while self.run_flag.is_set():
-                free, total = torch.cuda.mem_get_info() # calling with self.device errors, torch bug?
+                free, total = self.cuda_mem_get_info()
                 self.data["min_free"] = min(self.data["min_free"], free)
 
                 time.sleep(1 / self.opts.memmon_poll_rate)
@@ -70,7 +74,7 @@ class MemUsageMonitor(threading.Thread):
 
     def read(self):
         if not self.disabled:
-            free, total = torch.cuda.mem_get_info()
+            free, total = self.cuda_mem_get_info()
             self.data["free"] = free
             self.data["total"] = total
 
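
For reference, a minimal standalone sketch of the idea behind the new cuda_mem_get_info helper, separate from the patch itself. This is illustrative only: it assumes a torch build with CUDA available, and the "cuda:1" device is a hypothetical choice for the multi-GPU case.

    import torch

    # Pick an explicit device; without an index, mem_get_info() reports the
    # *current* device, which may not be the one the monitor was created for.
    device = torch.device("cuda:1") if torch.cuda.device_count() > 1 else torch.device("cuda")
    index = device.index if device.index is not None else torch.cuda.current_device()

    free, total = torch.cuda.mem_get_info(index)
    print(f"cuda:{index}: {free / 2**20:.0f} MiB free of {total / 2**20:.0f} MiB total")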