Source code for neuralib.util.gpu

import platform
import subprocess
from typing import Literal, TypedDict, cast

import polars as pl
from neuralib.util.table import rich_data_frame_table
from neuralib.util.verbose import fprint

# Names exported by ``from neuralib.util.gpu import *``.
__all__ = [
    'print_gpu_table',
    'gpu_available',
    'check_mps_available',
    'check_nvidia_cuda_available'
]

# Backend names accepted by the availability checks in this module.
RUN_BACKEND = Literal['tensorflow', 'torch']


def gpu_available(backend: RUN_BACKEND, *, check_smi: bool = False) -> bool:
    """
    Report whether a GPU can be used on the current operating system.

    Dispatches on ``platform.system()``: Windows and Linux are checked for
    CUDA, macOS (``Darwin``) is checked for MPS.

    :param backend: {'torch', 'tensorflow'}
    :param check_smi: forwarded to :func:`check_nvidia_cuda_available`
        (check if ``nvidia-smi`` is runnable); unused on macOS
    :return: True if the platform-specific check succeeds
    :raises NotImplementedError: if the operating system is not handled
    """
    os_name = platform.system()

    if os_name == 'Darwin':
        return check_mps_available(backend=backend)

    if os_name in ('win32', 'Windows', 'Linux'):
        return check_nvidia_cuda_available(backend=backend, check_smi=check_smi)

    raise NotImplementedError(f'Unsupported system {os_name}')
# ============= # # Windows/Linux # # ============= # class GPUInfoWin(TypedDict, total=False): id: str name: str driver_version: str | None gpu_load: str """percentage of GPU usage""" total_memory: float """in MB""" free_memory: float used_memory: float temperature: float """in celsius""" def _get_gpu_windows() -> list[GPUInfoWin]: import GPUtil # pyright: ignore[reportMissingImports] ret = [] gpus = GPUtil.getGPUs() for gpu in gpus: ret.append( GPUInfoWin( id=gpu.id, name=gpu.name, driver_version=gpu.driver, gpu_load=gpu.load, total_memory=gpu.memoryTotal, free_memory=gpu.memoryFree, used_memory=gpu.memoryUsed, temperature=gpu.temperature ) ) return ret
def check_nvidia_cuda_available(backend: RUN_BACKEND, check_smi: bool = False) -> bool:
    """
    Check whether CUDA can be used from the given backend in the current environment.

    Useful to check before long GPU-dependent processes. A missing GPU is
    reported through the return value (and ``fprint`` messages); it does not
    raise.

    :param backend: {'torch', 'tensorflow'}
    :param check_smi: additionally run ``nvidia-smi`` and print whether it
        succeeded. The outcome is informational only and does not change the
        return value
    :return: True if the backend reports that CUDA is usable
    :raises ValueError: if ``backend`` is not a supported backend name
    """
    if check_smi:
        try:
            # argv list, no shell: nothing to quote and no extra process
            proc = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except OSError as err:
            # executable missing or not launchable; previously reported by the shell
            fprint(f'{err}', vtype='warning')
        else:
            if proc.returncode != 0:
                # stderr encoding depends on the console; never fail on decoding
                fprint(f"{proc.stderr.decode('utf-8', errors='replace')}", vtype='warning')
            else:
                fprint("nvidia-smi command successful", vtype='pass')

    #
    if backend == 'torch':
        import torch  # pyright: ignore[reportMissingImports]
        is_available = bool(torch.cuda.is_available())

    elif backend == 'tensorflow':
        import tensorflow as tf  # pyright: ignore[reportMissingModuleSource]
        # ``is_built_with_cuda`` only describes how the wheel was compiled;
        # a GPU device must also be visible at runtime.
        is_available = bool(tf.test.is_built_with_cuda()) and bool(tf.config.list_physical_devices('GPU'))

    else:
        raise ValueError(f'unknown backend: {backend}')

    #
    if is_available:
        fprint(f'cuda is available in current env using backend {backend}', vtype='pass')

    return is_available
# ====== #
# Mac OS #
# ====== #


class GPUInfoMac(TypedDict, total=False):
    """GPU description on macOS as returned by :func:`_get_gpu_mac` (all keys optional)."""
    Chipset_Model: str
    Type: str
    Bus: str
    VRAM: str
    Vendor: str
    Device_ID: str
    Revision_ID: str
    Metal_Support: str
    mps_available: bool


def _get_gpu_mac(backend: RUN_BACKEND) -> GPUInfoMac:
    """
    Get mac gpu info by parsing ``system_profiler SPDisplaysDataType`` output.

    Every ``key: value`` line between the ``Graphics/Displays:`` header and the
    first ``Displays:`` line is stored under the key text exactly as printed.
    Lines without a value (or without a colon at all) are skipped. If a key
    occurs more than once, the last occurrence wins.

    :param backend: {'torch', 'tensorflow'}, used for the ``mps_available`` entry
    :return: ``GPUInfoMac``
    :raises subprocess.CalledProcessError: if ``system_profiler`` exits non-zero
    """
    output = subprocess.check_output(["system_profiler", "SPDisplaysDataType"], universal_newlines=True)

    ret: dict[str, str | bool] = {}
    in_graphics_section = False

    for line in output.splitlines():
        line = line.strip()

        if not line:
            continue
        elif line.startswith('Graphics/Displays:'):
            in_graphics_section = True
        elif line.startswith('Displays:'):
            # attached-monitor details follow; they are not GPU properties
            break
        elif in_graphics_section:
            # ``partition`` never raises on a line without ':' (value is then '')
            key, _, value = line.partition(':')
            value = value.strip()
            if value:
                ret[key.strip()] = value

    ret['mps_available'] = check_mps_available(backend=backend)

    return cast(GPUInfoMac, ret)
def check_mps_available(backend: RUN_BACKEND) -> bool:
    """
    Check if Metal (MPS) GPU acceleration is available for the given backend.

    For ``torch`` the reason for unavailability is printed as a warning
    (build without MPS vs. unsupported OS/device). For ``tensorflow`` the
    check is whether any GPU device is visible to TensorFlow.

    :param backend: {'torch', 'tensorflow'}
    :return: bool
    :raises NotImplementedError: if ``backend`` is not a supported backend name
    """
    if backend == 'torch':
        import torch  # pyright: ignore[reportMissingImports]

        is_available = bool(torch.backends.mps.is_available())
        if not is_available:
            if not torch.backends.mps.is_built():
                fprint('MPS not available because pytorch install not built with MPS enable', vtype='warning')
            else:
                fprint('MPS not available because current MacOs version is not 12.3+,'
                       ' or do not have MPS-enabled device on this machine', vtype='warning')

    elif backend == 'tensorflow':
        import tensorflow as tf  # pyright: ignore[reportMissingModuleSource]

        # ``tf.test.is_gpu_available`` is deprecated; listing the physical GPU
        # devices is the documented replacement.
        is_available = bool(tf.config.list_physical_devices('GPU'))
        if not is_available:
            fprint('MPS not available in tensorflow backend', vtype='warning')

    else:
        raise NotImplementedError(f'unknown backend: {backend}')

    #
    if is_available:
        fprint(f'MPS is available using backend: {backend}', vtype='pass')

    return is_available