diff --git a/CHANGELOG.md b/CHANGELOG.md
index c9909de4..68cf87ed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,111 +1,25 @@
-# v4.6.1-beta
+# v5.0.0
+This version is the first stable unified 3 in 1 GPU+CPU release. OpenCL support is built into the miner and does not require additional external dependencies at compile time. NVIDIA CUDA is available as an external [CUDA plugin](https://github.com/xmrig/xmrig-cuda); for convenience, 3 in 1 downloads with a recent CUDA version are also provided.
+
+This release is based on the 4.x.x series and includes all features from v4.6.2-beta. The changelog below includes only the most important changes; the [full changelog](doc/CHANGELOG_OLD.md) is available separately.
+
 - [#1272](https://github.com/xmrig/xmrig/pull/1272) Optimized hashrate calculation.
-- [#1273](https://github.com/xmrig/xmrig/issues/1273) Fixed crash when use `GET /2/backends` API endpoint with disabled CUDA.
-
-# v4.6.0-beta
 - [#1263](https://github.com/xmrig/xmrig/pull/1263) Added new option `dataset_host` for NVIDIA GPUs with less than 4 GB memory (RandomX only).
-
-# v4.5.0-beta
-- Added NVIDIA CUDA support via external [CUDA plugun](https://github.com/xmrig/xmrig-cuda). XMRig now is unified 3 in 1 miner.
-
-# v4.4.0-beta
 - [#1068](https://github.com/xmrig/xmrig/pull/1068) Added support for `self-select` stratum protocol extension.
-- [#1240](https://github.com/xmrig/xmrig/pull/1240) Sync with the latest RandomX code.
-- [#1241](https://github.com/xmrig/xmrig/issues/1241) Fixed regression with colors on old Windows systems.
-- [#1243](https://github.com/xmrig/xmrig/pull/1243) Fixed incorrect OpenCL memory size detection in some cases.
-- [#1247](https://github.com/xmrig/xmrig/pull/1247) Fixed ARM64 RandomX code alignment.
-- [#1248](https://github.com/xmrig/xmrig/pull/1248) Fixed RandomX code cache cleanup on iOS/Darwin.
-
-# v4.3.1-beta
-- Fixed regression in v4.3.0, miner didn't create `cn` mining profile with default config example.
-
-# v4.3.0-beta
 - [#1227](https://github.com/xmrig/xmrig/pull/1227) Added new algorithm `rx/arq`, RandomX variant for upcoming ArQmA fork.
 - [#808](https://github.com/xmrig/xmrig/issues/808#issuecomment-539297156) Added experimental support for persistent memory for CPU mining threads.
 - [#1221](https://github.com/xmrig/xmrig/issues/1221) Improved RandomX dataset memory usage and initialization speed for NUMA machines.
-
-# v4.2.1-beta
-- [#1150](https://github.com/xmrig/xmrig/issues/1150) Fixed build on FreeBSD.
 - [#1175](https://github.com/xmrig/xmrig/issues/1175) Fixed support for systems where total count of NUMA nodes not equal usable nodes count.
-- [#1199](https://github.com/xmrig/xmrig/issues/1199) Fixed excessive memory allocation for OpenCL threads with low intensity.
-- [#1212](https://github.com/xmrig/xmrig/issues/1212) Fixed low RandomX performance after fast algorithm switching.
-
-# v4.2.0-beta
-- [#1202](https://github.com/xmrig/xmrig/issues/1202) Fixed algorithm verification in donate strategy.
-- Added per pool option `coin` with single possible value `monero` for pools without algorithm negotiation, for upcoming Monero fork.
 - Added config option `cpu/max-threads-hint` and command line option `--cpu-max-threads-hint`.
-
-# v4.1.0-beta
-- **OpenCL backend disabled by default.**.
-- [#1183](https://github.com/xmrig/xmrig/issues/1183) Fixed compatibility with systemd.
 - [#1185](https://github.com/xmrig/xmrig/pull/1185) Added JIT compiler for RandomX on ARMv8.
 - Improved API endpoint `GET /2/backends` and added support for this endpoint to [workers.xmrig.info](http://workers.xmrig.info).
 - Added command line option `--no-cpu` to disable CPU backend.
 - Added OpenCL specific command line options: `--opencl`, `--opencl-devices`, `--opencl-platform`, `--opencl-loader` and `--opencl-no-cache`.
+- Added CUDA specific command line options: `--cuda`, `--cuda-loader`, `--cuda-devices` and `--no-nvml`.
 - Removed command line option `--http-enabled`, HTTP API enabled automatically if any other `--http-*` option provided.
-
-# v4.0.1-beta
-- [#1177](https://github.com/xmrig/xmrig/issues/1177) Fixed compatibility with old AMD drivers.
-- [#1180](https://github.com/xmrig/xmrig/issues/1180) Fixed possible duplicated shares after algorithm switching.
-- Added support for case if not all backend threads successfully started.
-- Fixed wrong config file permissions after write (only gcc builds on recent Windows 10 affected).
-
-# v4.0.0-beta
 - [#1172](https://github.com/xmrig/xmrig/issues/1172) **Added OpenCL mining backend.**
   - [#268](https://github.com/xmrig/xmrig-amd/pull/268) [#270](https://github.com/xmrig/xmrig-amd/pull/270) [#271](https://github.com/xmrig/xmrig-amd/pull/271) [#273](https://github.com/xmrig/xmrig-amd/pull/273) [#274](https://github.com/xmrig/xmrig-amd/pull/274) [#1171](https://github.com/xmrig/xmrig/pull/1171) Added RandomX support for OpenCL, thanks [@SChernykh](https://github.com/SChernykh).
 - Algorithm `cn/wow` removed, as no longer alive. 
 
-# v3.2.0
-- Added per pool option `coin` with single possible value `monero` for pools without algorithm negotiation, for upcoming Monero fork.
-- [#1183](https://github.com/xmrig/xmrig/issues/1183) Fixed compatibility with systemd.
-
-# v3.1.3
-- [#1180](https://github.com/xmrig/xmrig/issues/1180) Fixed possible duplicated shares after algorithm switching.
-- Fixed wrong config file permissions after write (only gcc builds on recent Windows 10 affected).
-
-# v3.1.2
-- Many RandomX optimizations and fixes.
-  - [#1132](https://github.com/xmrig/xmrig/issues/1132) Fixed build on CentOS 7.
-  - [#1163](https://github.com/xmrig/xmrig/pull/1163) Optimized soft AES code, up to +30% hashrate on CPU without AES support and other optimizations.
-  - [#1166](https://github.com/xmrig/xmrig/pull/1166) Fixed crash when initialize dataset with big threads count (eg 272).
-  - [#1168](https://github.com/xmrig/xmrig/pull/1168) Optimized loading from scratchpad.
-- [#1128](https://github.com/xmrig/xmrig/issues/1128) Fixed CMake 2.8 compatibility.
-
-# v3.1.1
-- [#1133](https://github.com/xmrig/xmrig/issues/1133) Fixed syslog regression.
-- [#1138](https://github.com/xmrig/xmrig/issues/1138) Fixed multiple network bugs.
-- [#1141](https://github.com/xmrig/xmrig/issues/1141) Fixed log in background mode.
-- [#1142](https://github.com/xmrig/xmrig/pull/1142) RandomX hashrate improved by 0.5-1.5% depending on variant and CPU.
-- [#1146](https://github.com/xmrig/xmrig/pull/1146) Fixed race condition in RandomX thread init.
-- [#1148](https://github.com/xmrig/xmrig/pull/1148) Fixed, on Linux linker marking entire executable as having an executable stack.
-- Fixed, for Argon2 algorithms command line options like `--threads` was ignored.
-- Fixed command line options for single pool, free order allowed again.
-
-# v3.1.0
-- [#1107](https://github.com/xmrig/xmrig/issues/1107#issuecomment-522235892) Added Argon2 algorithm family: `argon2/chukwa` and `argon2/wrkz`.
-
-# v3.0.0
-- **[#1111](https://github.com/xmrig/xmrig/pull/1111) Added RandomX (`rx/test`) algorithm for testing and benchmarking.**
-- **[#1036](https://github.com/xmrig/xmrig/pull/1036) Added RandomWOW (`rx/wow`) algorithm for [Wownero](http://wownero.org/).**
-- **[#1050](https://github.com/xmrig/xmrig/pull/1050) Added RandomXL (`rx/loki`) algorithm for [Loki](https://loki.network/).**
-- **[#1077](https://github.com/xmrig/xmrig/issues/1077) Added NUMA support via hwloc**.
-- **Added flexible [multi algorithm](doc/CPU.md) configuration.**
-- **Added unlimited switching between incompatible algorithms, all mining options can be changed in runtime.**
-- [#257](https://github.com/xmrig/xmrig-nvidia/pull/257) New logging subsystem, file and syslog now always without colors.
-- [#314](https://github.com/xmrig/xmrig-proxy/issues/314) Added donate over proxy feature.
-- [#1007](https://github.com/xmrig/xmrig/issues/1007) Old HTTP API backend based on libmicrohttpd, replaced to custom HTTP server (libuv + http_parser).
-- [#1010](https://github.com/xmrig/xmrig/pull/1010#issuecomment-482632107) Added daemon support (solo mining).
-- [#1066](https://github.com/xmrig/xmrig/issues/1066#issuecomment-518080529) Added error message if pool not ready for RandomX.
-- [#1105](https://github.com/xmrig/xmrig/issues/1105) Improved auto configuration for `cn-pico` algorithm.
-- Added commands `pause` and `resume` via JSON RPC 2.0 API (`POST /json_rpc`).
-- Added command line option `--export-topology` for export hwloc topology to a XML file.
-- Breaked backward compatibility with previous configs and command line, `variant` option replaced to `algo`, global option `algo` removed, all CPU related settings moved to `cpu` object.
-- Options `av`, `safe` and `max-cpu-usage` removed.
-- Algorithm `cn/msr` renamed to `cn/fast`.
-- Algorithm `cn/xtl` removed.
-- API endpoint `GET /1/threads` replaced to `GET /2/backends`.
-- Added global uptime and extended connection information in API.
-- API now return current algorithm.
-
 # Previous versions
 [doc/CHANGELOG_OLD.md](doc/CHANGELOG_OLD.md)
diff --git a/README.md b/README.md
index ac978393..cf286f86 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ API:
 
 OpenCL backend:
       --opencl                  enable OpenCL mining backend
-      --opencl-devices=N        list of OpenCL devices to use
+      --opencl-devices=N        comma separated list of OpenCL devices to use
       --opencl-platform=N       OpenCL platform index or name
       --opencl-loader=PATH      path to OpenCL-ICD-Loader (OpenCL.dll or libOpenCL.so)
       --opencl-no-cache         disable OpenCL cache
@@ -83,6 +83,7 @@ OpenCL backend:
 CUDA backend:
       --cuda                    enable CUDA mining backend
       --cuda-loader=PATH        path to CUDA plugin (xmrig-cuda.dll or libxmrig-cuda.so)
+      --cuda-devices=N          comma separated list of CUDA devices to use
       --no-nvml                 disable NVML (NVIDIA Management Library) support
 
 Logging:
diff --git a/doc/CHANGELOG_OLD.md b/doc/CHANGELOG_OLD.md
index 58be062b..70059190 100644
--- a/doc/CHANGELOG_OLD.md
+++ b/doc/CHANGELOG_OLD.md
@@ -1,3 +1,116 @@
+# v4.6.2-beta
+- [#1274](https://github.com/xmrig/xmrig/issues/1274) Added `--cuda-devices` command line option.
+- [#1277](https://github.com/xmrig/xmrig/pull/1277) Fixed function names for clang on Apple.
+
+# v4.6.1-beta
+- [#1272](https://github.com/xmrig/xmrig/pull/1272) Optimized hashrate calculation.
+- [#1273](https://github.com/xmrig/xmrig/issues/1273) Fixed crash when use `GET /2/backends` API endpoint with disabled CUDA.
+
+# v4.6.0-beta
+- [#1263](https://github.com/xmrig/xmrig/pull/1263) Added new option `dataset_host` for NVIDIA GPUs with less than 4 GB memory (RandomX only).
+
+# v4.5.0-beta
+- Added NVIDIA CUDA support via external [CUDA plugin](https://github.com/xmrig/xmrig-cuda). XMRig is now a unified 3 in 1 miner.
+
+# v4.4.0-beta
+- [#1068](https://github.com/xmrig/xmrig/pull/1068) Added support for `self-select` stratum protocol extension.
+- [#1240](https://github.com/xmrig/xmrig/pull/1240) Sync with the latest RandomX code.
+- [#1241](https://github.com/xmrig/xmrig/issues/1241) Fixed regression with colors on old Windows systems.
+- [#1243](https://github.com/xmrig/xmrig/pull/1243) Fixed incorrect OpenCL memory size detection in some cases.
+- [#1247](https://github.com/xmrig/xmrig/pull/1247) Fixed ARM64 RandomX code alignment.
+- [#1248](https://github.com/xmrig/xmrig/pull/1248) Fixed RandomX code cache cleanup on iOS/Darwin.
+
+# v4.3.1-beta
+- Fixed regression in v4.3.0, miner didn't create `cn` mining profile with default config example.
+
+# v4.3.0-beta
+- [#1227](https://github.com/xmrig/xmrig/pull/1227) Added new algorithm `rx/arq`, RandomX variant for upcoming ArQmA fork.
+- [#808](https://github.com/xmrig/xmrig/issues/808#issuecomment-539297156) Added experimental support for persistent memory for CPU mining threads.
+- [#1221](https://github.com/xmrig/xmrig/issues/1221) Improved RandomX dataset memory usage and initialization speed for NUMA machines.
+
+# v4.2.1-beta
+- [#1150](https://github.com/xmrig/xmrig/issues/1150) Fixed build on FreeBSD.
+- [#1175](https://github.com/xmrig/xmrig/issues/1175) Fixed support for systems where total count of NUMA nodes not equal usable nodes count.
+- [#1199](https://github.com/xmrig/xmrig/issues/1199) Fixed excessive memory allocation for OpenCL threads with low intensity.
+- [#1212](https://github.com/xmrig/xmrig/issues/1212) Fixed low RandomX performance after fast algorithm switching.
+
+# v4.2.0-beta
+- [#1202](https://github.com/xmrig/xmrig/issues/1202) Fixed algorithm verification in donate strategy.
+- Added per pool option `coin` with single possible value `monero` for pools without algorithm negotiation, for upcoming Monero fork.
+- Added config option `cpu/max-threads-hint` and command line option `--cpu-max-threads-hint`.
+
+# v4.1.0-beta
+- **OpenCL backend disabled by default.**
+- [#1183](https://github.com/xmrig/xmrig/issues/1183) Fixed compatibility with systemd.
+- [#1185](https://github.com/xmrig/xmrig/pull/1185) Added JIT compiler for RandomX on ARMv8.
+- Improved API endpoint `GET /2/backends` and added support for this endpoint to [workers.xmrig.info](http://workers.xmrig.info).
+- Added command line option `--no-cpu` to disable CPU backend.
+- Added OpenCL specific command line options: `--opencl`, `--opencl-devices`, `--opencl-platform`, `--opencl-loader` and `--opencl-no-cache`.
+- Removed command line option `--http-enabled`, HTTP API enabled automatically if any other `--http-*` option provided.
+
+# v4.0.1-beta
+- [#1177](https://github.com/xmrig/xmrig/issues/1177) Fixed compatibility with old AMD drivers.
+- [#1180](https://github.com/xmrig/xmrig/issues/1180) Fixed possible duplicated shares after algorithm switching.
+- Added support for case if not all backend threads successfully started.
+- Fixed wrong config file permissions after write (only gcc builds on recent Windows 10 affected).
+
+# v4.0.0-beta
+- [#1172](https://github.com/xmrig/xmrig/issues/1172) **Added OpenCL mining backend.**
+  - [#268](https://github.com/xmrig/xmrig-amd/pull/268) [#270](https://github.com/xmrig/xmrig-amd/pull/270) [#271](https://github.com/xmrig/xmrig-amd/pull/271) [#273](https://github.com/xmrig/xmrig-amd/pull/273) [#274](https://github.com/xmrig/xmrig-amd/pull/274) [#1171](https://github.com/xmrig/xmrig/pull/1171) Added RandomX support for OpenCL, thanks [@SChernykh](https://github.com/SChernykh).
+- Algorithm `cn/wow` removed, as no longer alive. 
+
+# v3.2.0
+- Added per pool option `coin` with single possible value `monero` for pools without algorithm negotiation, for upcoming Monero fork.
+- [#1183](https://github.com/xmrig/xmrig/issues/1183) Fixed compatibility with systemd.
+
+# v3.1.3
+- [#1180](https://github.com/xmrig/xmrig/issues/1180) Fixed possible duplicated shares after algorithm switching.
+- Fixed wrong config file permissions after write (only gcc builds on recent Windows 10 affected).
+
+# v3.1.2
+- Many RandomX optimizations and fixes.
+  - [#1132](https://github.com/xmrig/xmrig/issues/1132) Fixed build on CentOS 7.
+  - [#1163](https://github.com/xmrig/xmrig/pull/1163) Optimized soft AES code, up to +30% hashrate on CPU without AES support and other optimizations.
+  - [#1166](https://github.com/xmrig/xmrig/pull/1166) Fixed crash when initialize dataset with big threads count (eg 272).
+  - [#1168](https://github.com/xmrig/xmrig/pull/1168) Optimized loading from scratchpad.
+- [#1128](https://github.com/xmrig/xmrig/issues/1128) Fixed CMake 2.8 compatibility.
+
+# v3.1.1
+- [#1133](https://github.com/xmrig/xmrig/issues/1133) Fixed syslog regression.
+- [#1138](https://github.com/xmrig/xmrig/issues/1138) Fixed multiple network bugs.
+- [#1141](https://github.com/xmrig/xmrig/issues/1141) Fixed log in background mode.
+- [#1142](https://github.com/xmrig/xmrig/pull/1142) RandomX hashrate improved by 0.5-1.5% depending on variant and CPU.
+- [#1146](https://github.com/xmrig/xmrig/pull/1146) Fixed race condition in RandomX thread init.
+- [#1148](https://github.com/xmrig/xmrig/pull/1148) Fixed, on Linux linker marking entire executable as having an executable stack.
+- Fixed, for Argon2 algorithms command line options like `--threads` was ignored.
+- Fixed command line options for single pool, free order allowed again.
+
+# v3.1.0
+- [#1107](https://github.com/xmrig/xmrig/issues/1107#issuecomment-522235892) Added Argon2 algorithm family: `argon2/chukwa` and `argon2/wrkz`.
+
+# v3.0.0
+- **[#1111](https://github.com/xmrig/xmrig/pull/1111) Added RandomX (`rx/test`) algorithm for testing and benchmarking.**
+- **[#1036](https://github.com/xmrig/xmrig/pull/1036) Added RandomWOW (`rx/wow`) algorithm for [Wownero](http://wownero.org/).**
+- **[#1050](https://github.com/xmrig/xmrig/pull/1050) Added RandomXL (`rx/loki`) algorithm for [Loki](https://loki.network/).**
+- **[#1077](https://github.com/xmrig/xmrig/issues/1077) Added NUMA support via hwloc**.
+- **Added flexible [multi algorithm](doc/CPU.md) configuration.**
+- **Added unlimited switching between incompatible algorithms, all mining options can be changed in runtime.**
+- [#257](https://github.com/xmrig/xmrig-nvidia/pull/257) New logging subsystem, file and syslog now always without colors.
+- [#314](https://github.com/xmrig/xmrig-proxy/issues/314) Added donate over proxy feature.
+- [#1007](https://github.com/xmrig/xmrig/issues/1007) Old HTTP API backend based on libmicrohttpd, replaced to custom HTTP server (libuv + http_parser).
+- [#1010](https://github.com/xmrig/xmrig/pull/1010#issuecomment-482632107) Added daemon support (solo mining).
+- [#1066](https://github.com/xmrig/xmrig/issues/1066#issuecomment-518080529) Added error message if pool not ready for RandomX.
+- [#1105](https://github.com/xmrig/xmrig/issues/1105) Improved auto configuration for `cn-pico` algorithm.
+- Added commands `pause` and `resume` via JSON RPC 2.0 API (`POST /json_rpc`).
+- Added command line option `--export-topology` for export hwloc topology to a XML file.
+- Broke backward compatibility with previous configs and command line: `variant` option replaced with `algo`, global option `algo` removed, all CPU related settings moved to `cpu` object.
+- Options `av`, `safe` and `max-cpu-usage` removed.
+- Algorithm `cn/msr` renamed to `cn/fast`.
+- Algorithm `cn/xtl` removed.
+- API endpoint `GET /1/threads` replaced to `GET /2/backends`.
+- Added global uptime and extended connection information in API.
+- API now returns current algorithm.
+
 # v2.99.6-beta
 - Added commands `pause` and `resume` via JSON RPC 2.0 API (`POST /json_rpc`).
 - Fixed autoconfig regression (since 2.99.5), mostly `rx/wow` was affected by this bug.
diff --git a/src/backend/cuda/CudaBackend.cpp b/src/backend/cuda/CudaBackend.cpp
index 812ee270..b351df75 100644
--- a/src/backend/cuda/CudaBackend.cpp
+++ b/src/backend/cuda/CudaBackend.cpp
@@ -155,11 +155,14 @@ public:
             return;
         }
 
+        devices = CudaLib::devices(cuda.bfactor(), cuda.bsleep(), cuda.devicesHint());
+        if (devices.empty()) {
+            return printDisabled(kLabel, RED_S " (no devices)");
+        }
+
         Log::print(GREEN_BOLD(" * ") WHITE_BOLD("%-13s") WHITE_BOLD("%s") "/" WHITE_BOLD("%s") BLACK_BOLD("/%s"), kLabel,
                    CudaLib::version(runtimeVersion).c_str(), CudaLib::version(driverVersion).c_str(), CudaLib::pluginVersion());
 
-        devices = CudaLib::devices(cuda.bfactor(), cuda.bsleep());
-
 #       ifdef XMRIG_FEATURE_NVML
         if (cuda.isNvmlEnabled()) {
             if (NvmlLib::init(cuda.nvmlLoader())) {
@@ -172,7 +175,7 @@ public:
                            );
             }
             else {
-                printDisabled(kLabel, RED_S " (failed to load NVML)");
+                printDisabled(kNvmlLabel, RED_S " (failed to load NVML)");
             }
         }
         else {
diff --git a/src/backend/cuda/CudaConfig.cpp b/src/backend/cuda/CudaConfig.cpp
index 49a28d11..8f26c14c 100644
--- a/src/backend/cuda/CudaConfig.cpp
+++ b/src/backend/cuda/CudaConfig.cpp
@@ -78,6 +78,16 @@ rapidjson::Value xmrig::CudaConfig::toJSON(rapidjson::Document &doc) const
 
 std::vector<xmrig::CudaLaunchData> xmrig::CudaConfig::get(const Miner *miner, const Algorithm &algorithm, const std::vector<CudaDevice> &devices) const
 {
+    auto deviceIndex = [&devices](uint32_t index) -> int {
+        for (uint32_t i = 0; i < devices.size(); ++i) {
+            if (devices[i].index() == index) {
+                return i;
+            }
+        }
+
+        return -1;
+    };
+
     std::vector<CudaLaunchData> out;
     const auto &threads = m_threads.get(algorithm);
 
@@ -85,15 +95,16 @@ std::vector<xmrig::CudaLaunchData> xmrig::CudaConfig::get(const Miner *miner, co
         return out;
     }
 
-    out.reserve(threads.count() * 2);
+    out.reserve(threads.count());
 
     for (const auto &thread : threads.data()) {
-        if (thread.index() >= devices.size()) {
+        const int index = deviceIndex(thread.index());
+        if (index == -1) {
             LOG_INFO("%s" YELLOW(" skip non-existing device with index ") YELLOW_BOLD("%u"), cuda_tag(), thread.index());
             continue;
         }
 
-        out.emplace_back(miner, algorithm, thread, devices[thread.index()]);
+        out.emplace_back(miner, algorithm, thread, devices[static_cast<size_t>(index)]);
     }
 
     return out;
@@ -153,7 +164,7 @@ void xmrig::CudaConfig::generate()
         return;
     }
 
-    const auto devices = CudaLib::devices(bfactor(), bsleep());
+    const auto devices = CudaLib::devices(bfactor(), bsleep(), m_devicesHint);
     if (devices.empty()) {
         return;
     }
diff --git a/src/backend/cuda/CudaConfig.h b/src/backend/cuda/CudaConfig.h
index 77be3dd4..3f3957e6 100644
--- a/src/backend/cuda/CudaConfig.h
+++ b/src/backend/cuda/CudaConfig.h
@@ -43,16 +43,17 @@ public:
     std::vector<CudaLaunchData> get(const Miner *miner, const Algorithm &algorithm, const std::vector<CudaDevice> &devices) const;
     void read(const rapidjson::Value &value);
 
-    inline bool isEnabled() const                       { return m_enabled; }
-    inline bool isShouldSave() const                    { return m_shouldSave; }
-    inline const String &loader() const                 { return m_loader; }
-    inline const Threads<CudaThreads> &threads() const  { return m_threads; }
-    inline int32_t bfactor() const                      { return m_bfactor; }
-    inline int32_t bsleep() const                       { return m_bsleep; }
+    inline bool isEnabled() const                               { return m_enabled; }
+    inline bool isShouldSave() const                            { return m_shouldSave; }
+    inline const std::vector<uint32_t> &devicesHint() const     { return m_devicesHint; }
+    inline const String &loader() const                         { return m_loader; }
+    inline const Threads<CudaThreads> &threads() const          { return m_threads; }
+    inline int32_t bfactor() const                              { return m_bfactor; }
+    inline int32_t bsleep() const                               { return m_bsleep; }
 
 #   ifdef XMRIG_FEATURE_NVML
-    inline bool isNvmlEnabled() const                   { return m_nvml; }
-    inline const String &nvmlLoader() const             { return m_nvmlLoader; }
+    inline bool isNvmlEnabled() const                           { return m_nvml; }
+    inline const String &nvmlLoader() const                     { return m_nvmlLoader; }
 #   endif
 
 private:
diff --git a/src/backend/cuda/wrappers/CudaLib.cpp b/src/backend/cuda/wrappers/CudaLib.cpp
index 8d22fefb..37924457 100644
--- a/src/backend/cuda/wrappers/CudaLib.cpp
+++ b/src/backend/cuda/wrappers/CudaLib.cpp
@@ -209,7 +209,7 @@ std::string xmrig::CudaLib::version(uint32_t version)
 }
 
 
-std::vector<xmrig::CudaDevice> xmrig::CudaLib::devices(int32_t bfactor, int32_t bsleep) noexcept
+std::vector<xmrig::CudaDevice> xmrig::CudaLib::devices(int32_t bfactor, int32_t bsleep, const std::vector<uint32_t> &hints) noexcept
 {
     const uint32_t count = deviceCount();
     if (!count) {
@@ -219,10 +219,24 @@ std::vector<xmrig::CudaDevice> xmrig::CudaLib::devices(int32_t bfactor, int32_t
     std::vector<CudaDevice> out;
     out.reserve(count);
 
-    for (uint32_t i = 0; i < count; ++i) {
-        CudaDevice device(i, bfactor, bsleep);
-        if (device.isValid()) {
-            out.emplace_back(std::move(device));
+    if (hints.empty()) {
+        for (uint32_t i = 0; i < count; ++i) {
+            CudaDevice device(i, bfactor, bsleep);
+            if (device.isValid()) {
+                out.emplace_back(std::move(device));
+            }
+        }
+    }
+    else {
+        for (const uint32_t i : hints) {
+            if (i >= count) {
+                continue;
+            }
+
+            CudaDevice device(i, bfactor, bsleep);
+            if (device.isValid()) {
+                out.emplace_back(std::move(device));
+            }
         }
     }
 
diff --git a/src/backend/cuda/wrappers/CudaLib.h b/src/backend/cuda/wrappers/CudaLib.h
index f18ed350..4874112f 100644
--- a/src/backend/cuda/wrappers/CudaLib.h
+++ b/src/backend/cuda/wrappers/CudaLib.h
@@ -85,7 +85,7 @@ public:
     static int32_t deviceInt(nvid_ctx *ctx, DeviceProperty property) noexcept;
     static nvid_ctx *alloc(uint32_t id, int32_t bfactor, int32_t bsleep) noexcept;
     static std::string version(uint32_t version);
-    static std::vector<CudaDevice> devices(int32_t bfactor, int32_t bsleep) noexcept;
+    static std::vector<CudaDevice> devices(int32_t bfactor, int32_t bsleep, const std::vector<uint32_t> &hints) noexcept;
     static uint32_t deviceCount() noexcept;
     static uint32_t deviceUint(nvid_ctx *ctx, DeviceProperty property) noexcept;
     static uint32_t driverVersion() noexcept;
diff --git a/src/base/io/json/Json.cpp b/src/base/io/json/Json.cpp
index 03d4c65a..9a578293 100644
--- a/src/base/io/json/Json.cpp
+++ b/src/base/io/json/Json.cpp
@@ -50,7 +50,7 @@ bool xmrig::Json::getBool(const rapidjson::Value &obj, const char *key, bool def
 }
 
 
-const char *xmrig::Json::getString(const rapidjson::Value &obj, const char *key,  const char *defaultValue)
+const char *xmrig::Json::getString(const rapidjson::Value &obj, const char *key, const char *defaultValue)
 {
     assert(obj.IsObject());
 
diff --git a/src/core/config/ConfigTransform.cpp b/src/core/config/ConfigTransform.cpp
index ffd3fbd8..4ece6186 100644
--- a/src/core/config/ConfigTransform.cpp
+++ b/src/core/config/ConfigTransform.cpp
@@ -187,10 +187,14 @@ void xmrig::ConfigTransform::transform(rapidjson::Document &doc, int key, const
 
 #   ifdef XMRIG_FEATURE_CUDA
     case IConfig::CudaKey: /* --cuda */
-        return set(doc, kCuda, "enabled", true);
+        return set(doc, kCuda, kEnabled, true);
 
     case IConfig::CudaLoaderKey: /* --cuda-loader */
         return set(doc, kCuda, "loader", arg);
+
+    case IConfig::CudaDevicesKey: /* --cuda-devices */
+        set(doc, kCuda, kEnabled, true);
+        return set(doc, kCuda, "devices-hint", arg);
 #   endif
 
 #   ifdef XMRIG_FEATURE_NVML
diff --git a/src/core/config/Config_platform.h b/src/core/config/Config_platform.h
index e0df3d68..05b39952 100644
--- a/src/core/config/Config_platform.h
+++ b/src/core/config/Config_platform.h
@@ -109,6 +109,7 @@ static const option options[] = {
 #   ifdef XMRIG_FEATURE_CUDA
     { "cuda",                  0, nullptr, IConfig::CudaKey               },
     { "cuda-loader",           1, nullptr, IConfig::CudaLoaderKey         },
+    { "cuda-devices",          1, nullptr, IConfig::CudaDevicesKey        },
 #   endif
 #   ifdef XMRIG_FEATURE_NVML
     { "no-nvml",               0, nullptr, IConfig::NvmlKey               },
diff --git a/src/core/config/usage.h b/src/core/config/usage.h
index d9ef2316..bcc3abd1 100644
--- a/src/core/config/usage.h
+++ b/src/core/config/usage.h
@@ -101,7 +101,7 @@ static inline const std::string &usage()
 #   ifdef XMRIG_FEATURE_OPENCL
     u += "\nOpenCL backend:\n";
     u += "      --opencl                  enable OpenCL mining backend\n";
-    u += "      --opencl-devices=N        list of OpenCL devices to use\n";
+    u += "      --opencl-devices=N        comma separated list of OpenCL devices to use\n";
     u += "      --opencl-platform=N       OpenCL platform index or name\n";
     u += "      --opencl-loader=PATH      path to OpenCL-ICD-Loader (OpenCL.dll or libOpenCL.so)\n";
     u += "      --opencl-no-cache         disable OpenCL cache\n";
@@ -112,6 +112,7 @@ static inline const std::string &usage()
     u += "\nCUDA backend:\n";
     u += "      --cuda                    enable CUDA mining backend\n";
     u += "      --cuda-loader=PATH        path to CUDA plugin (xmrig-cuda.dll or libxmrig-cuda.so)\n";
+    u += "      --cuda-devices=N          comma separated list of CUDA devices to use\n";
 #   endif
 #   ifdef XMRIG_FEATURE_NVML
     u += "      --no-nvml                 disable NVML (NVIDIA Management Library) support\n";
diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S
index 13fd5c57..37c044c8 100644
--- a/src/crypto/randomx/jit_compiler_a64_static.S
+++ b/src/crypto/randomx/jit_compiler_a64_static.S
@@ -25,26 +25,32 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+#if defined(__APPLE__)
+#define DECL(x) _##x
+#else
+#define DECL(x) x
+#endif
+
 	.arch armv8-a
 	.text
-	.global	randomx_program_aarch64
-	.global	randomx_program_aarch64_main_loop
-	.global	randomx_program_aarch64_vm_instructions
-	.global randomx_program_aarch64_imul_rcp_literals_end
-	.global	randomx_program_aarch64_vm_instructions_end
-	.global randomx_program_aarch64_cacheline_align_mask1
-	.global randomx_program_aarch64_cacheline_align_mask2
-	.global randomx_program_aarch64_update_spMix1
-	.global randomx_program_aarch64_vm_instructions_end_light
-	.global randomx_program_aarch64_light_cacheline_align_mask
-	.global randomx_program_aarch64_light_dataset_offset
-	.global randomx_init_dataset_aarch64
-	.global randomx_init_dataset_aarch64_end
-	.global randomx_calc_dataset_item_aarch64
-	.global randomx_calc_dataset_item_aarch64_prefetch
-	.global randomx_calc_dataset_item_aarch64_mix
-	.global randomx_calc_dataset_item_aarch64_store_result
-	.global randomx_calc_dataset_item_aarch64_end
+	.global DECL(randomx_program_aarch64)
+	.global DECL(randomx_program_aarch64_main_loop)
+	.global DECL(randomx_program_aarch64_vm_instructions)
+	.global DECL(randomx_program_aarch64_imul_rcp_literals_end)
+	.global DECL(randomx_program_aarch64_vm_instructions_end)
+	.global DECL(randomx_program_aarch64_cacheline_align_mask1)
+	.global DECL(randomx_program_aarch64_cacheline_align_mask2)
+	.global DECL(randomx_program_aarch64_update_spMix1)
+	.global DECL(randomx_program_aarch64_vm_instructions_end_light)
+	.global DECL(randomx_program_aarch64_light_cacheline_align_mask)
+	.global DECL(randomx_program_aarch64_light_dataset_offset)
+	.global DECL(randomx_init_dataset_aarch64)
+	.global DECL(randomx_init_dataset_aarch64_end)
+	.global DECL(randomx_calc_dataset_item_aarch64)
+	.global DECL(randomx_calc_dataset_item_aarch64_prefetch)
+	.global DECL(randomx_calc_dataset_item_aarch64_mix)
+	.global DECL(randomx_calc_dataset_item_aarch64_store_result)
+	.global DECL(randomx_calc_dataset_item_aarch64_end)
 
 # Register allocation
 
@@ -99,7 +105,7 @@
 # v31 -> scale mask   = 0x81f000000000000081f0000000000000
 
 	.balign 4
-randomx_program_aarch64:
+DECL(randomx_program_aarch64):
 	# Save callee-saved registers
 	sub	sp, sp, 192
 	stp	x16, x17, [sp]
@@ -187,7 +193,7 @@ randomx_program_aarch64:
 	ldr	q14, literal_v14
 	ldr	q15, literal_v15
 
-randomx_program_aarch64_main_loop:
+DECL(randomx_program_aarch64_main_loop):
 	# spAddr0 = spMix1 & ScratchpadL3Mask64;
 	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
 	lsr	x18, x10, 32
@@ -260,7 +266,7 @@ randomx_program_aarch64_main_loop:
 	orr	v23.16b, v23.16b, v30.16b
 
 	# Execute VM instructions
-randomx_program_aarch64_vm_instructions:
+DECL(randomx_program_aarch64_vm_instructions):
 
 	# 16 KB buffer for generated instructions
 	.fill 4096,4,0
@@ -278,7 +284,7 @@ literal_x27: .fill 1,8,0
 literal_x28: .fill 1,8,0
 literal_x29: .fill 1,8,0
 literal_x30: .fill 1,8,0
-randomx_program_aarch64_imul_rcp_literals_end:
+DECL(randomx_program_aarch64_imul_rcp_literals_end):
 
 literal_v0:  .fill 2,8,0
 literal_v1:  .fill 2,8,0
@@ -297,14 +303,14 @@ literal_v13: .fill 2,8,0
 literal_v14: .fill 2,8,0
 literal_v15: .fill 2,8,0
 
-randomx_program_aarch64_vm_instructions_end:
+DECL(randomx_program_aarch64_vm_instructions_end):
 
 	# mx ^= r[readReg2] ^ r[readReg3];
 	eor	x9, x9, x18
 
 	# Calculate dataset pointer for dataset prefetch
 	mov	w18, w9
-randomx_program_aarch64_cacheline_align_mask1:
+DECL(randomx_program_aarch64_cacheline_align_mask1):
 	# Actual mask will be inserted by JIT compiler
 	and	x18, x18, 1
 	add	x18, x18, x1
@@ -317,12 +323,12 @@ randomx_program_aarch64_cacheline_align_mask1:
 
 	# Calculate dataset pointer for dataset read
 	mov	w10, w9
-randomx_program_aarch64_cacheline_align_mask2:
+DECL(randomx_program_aarch64_cacheline_align_mask2):
 	# Actual mask will be inserted by JIT compiler
 	and	x10, x10, 1
 	add	x10, x10, x1
 
-randomx_program_aarch64_xor_with_dataset_line:
+DECL(randomx_program_aarch64_xor_with_dataset_line):
 	# xor integer registers with dataset data
 	ldp	x18, x19, [x10]
 	eor	x4, x4, x18
@@ -337,7 +343,7 @@ randomx_program_aarch64_xor_with_dataset_line:
 	eor	x14, x14, x18
 	eor	x15, x15, x19
 
-randomx_program_aarch64_update_spMix1:
+DECL(randomx_program_aarch64_update_spMix1):
 	# JIT compiler will replace it with "eor x10, config.readReg0, config.readReg1"
 	eor	x10, x0, x0
 
@@ -358,7 +364,7 @@ randomx_program_aarch64_update_spMix1:
 	stp	q18, q19, [x16, 32]
 
 	subs	x3, x3, 1
-	bne	randomx_program_aarch64_main_loop
+	bne	DECL(randomx_program_aarch64_main_loop)
 	
 	# Restore x0
 	ldr	x0, [sp], 16
@@ -392,7 +398,7 @@ randomx_program_aarch64_update_spMix1:
 
 	ret
 
-randomx_program_aarch64_vm_instructions_end_light:
+DECL(randomx_program_aarch64_vm_instructions_end_light):
 	sub	sp, sp, 96
 	stp	x0, x1, [sp, 64]
 	stp	x2, x30, [sp, 80]
@@ -409,26 +415,26 @@ randomx_program_aarch64_vm_instructions_end_light:
 	# x1 -> pointer to output
 	mov	x1, sp
 
-randomx_program_aarch64_light_cacheline_align_mask:
+DECL(randomx_program_aarch64_light_cacheline_align_mask):
 	# Actual mask will be inserted by JIT compiler
 	and	w2, w9, 1
 
 	# x2 -> item number
 	lsr	x2, x2, 6
 
-randomx_program_aarch64_light_dataset_offset:
+DECL(randomx_program_aarch64_light_dataset_offset):
 	# Apply dataset offset (filled in by JIT compiler)
 	add	x2, x2, 0
 	add	x2, x2, 0
 
-	bl	randomx_calc_dataset_item_aarch64
+	bl	DECL(randomx_calc_dataset_item_aarch64)
 
 	mov	x10, sp
 	ldp	x0, x1, [sp, 64]
 	ldp	x2, x30, [sp, 80]
 	add	sp, sp, 96
 
-	b	randomx_program_aarch64_xor_with_dataset_line
+	b	DECL(randomx_program_aarch64_xor_with_dataset_line)
 
 
 
@@ -439,26 +445,26 @@ randomx_program_aarch64_light_dataset_offset:
 # x2 -> start item
 # x3 -> end item
 
-randomx_init_dataset_aarch64:
+DECL(randomx_init_dataset_aarch64):
 	# Save x30 (return address)
 	str	x30, [sp, -16]!
 
 	# Load pointer to cache memory
 	ldr	x0, [x0]
 
-randomx_init_dataset_aarch64_main_loop:
-	bl	randomx_calc_dataset_item_aarch64
+DECL(randomx_init_dataset_aarch64_main_loop):
+	bl	DECL(randomx_calc_dataset_item_aarch64)
 	add	x1, x1, 64
 	add	x2, x2, 1
 	cmp	x2, x3
-	bne	randomx_init_dataset_aarch64_main_loop
+	bne	DECL(randomx_init_dataset_aarch64_main_loop)
 
 	# Restore x30 (return address)
 	ldr	x30, [sp], 16
 
 	ret
 
-randomx_init_dataset_aarch64_end:
+DECL(randomx_init_dataset_aarch64_end):
 
 # Input parameters
 #
@@ -476,7 +482,7 @@ randomx_init_dataset_aarch64_end:
 # x12 -> temporary
 # x13 -> temporary
 
-randomx_calc_dataset_item_aarch64:
+DECL(randomx_calc_dataset_item_aarch64):
 	sub	sp, sp, 112
 	stp	x0, x1, [sp]
 	stp	x2, x3, [sp, 16]
@@ -523,7 +529,7 @@ randomx_calc_dataset_item_aarch64:
 	ldr	x12, superscalarAdd7
 	eor	x7, x0, x12
 
-	b	randomx_calc_dataset_item_aarch64_prefetch
+	b	DECL(randomx_calc_dataset_item_aarch64_prefetch)
 
 superscalarMul0: .quad 6364136223846793005
 superscalarAdd1: .quad 9298411001130361340
@@ -536,7 +542,7 @@ superscalarAdd7: .quad 9549104520008361294
 
 # Prefetch -> SuperScalar hash -> Mix will be repeated N times
 
-randomx_calc_dataset_item_aarch64_prefetch:
+DECL(randomx_calc_dataset_item_aarch64_prefetch):
 	# Actual mask will be inserted by JIT compiler
 	and	x11, x10, 1
 	add	x11, x8, x11, lsl 6
@@ -544,7 +550,7 @@ randomx_calc_dataset_item_aarch64_prefetch:
 
 	# Generated SuperScalar hash program goes here
 
-randomx_calc_dataset_item_aarch64_mix:
+DECL(randomx_calc_dataset_item_aarch64_mix):
 	ldp	x12, x13, [x11]
 	eor	x0, x0, x12
 	eor	x1, x1, x13
@@ -558,7 +564,7 @@ randomx_calc_dataset_item_aarch64_mix:
 	eor	x6, x6, x12
 	eor	x7, x7, x13
 
-randomx_calc_dataset_item_aarch64_store_result:
+DECL(randomx_calc_dataset_item_aarch64_store_result):
 	stp	x0, x1, [x9]
 	stp	x2, x3, [x9, 16]
 	stp	x4, x5, [x9, 32]
@@ -575,4 +581,4 @@ randomx_calc_dataset_item_aarch64_store_result:
 
 	ret
 
-randomx_calc_dataset_item_aarch64_end:
+DECL(randomx_calc_dataset_item_aarch64_end):
diff --git a/src/version.h b/src/version.h
index ac6d4eec..287a92bd 100644
--- a/src/version.h
+++ b/src/version.h
@@ -28,15 +28,15 @@
 #define APP_ID        "xmrig"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig miner"
-#define APP_VERSION   "4.6.1-beta-mo1"
+#define APP_VERSION   "5.0.0-mo1"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2019 xmrig.com"
 #define APP_KIND      "miner"
 
-#define APP_VER_MAJOR  4
-#define APP_VER_MINOR  6
-#define APP_VER_PATCH  1
+#define APP_VER_MAJOR  5
+#define APP_VER_MINOR  0
+#define APP_VER_PATCH  0
 
 #ifdef _MSC_VER
 #   if (_MSC_VER >= 1920)