Using GPU-accelerated Ensembles with Automatic Differentiation

EnsembleGPUArray comes with derivative overloads for reverse-mode automatic differentiation, and thus can be used directly inside deep learning training loops. The following is an example of this use:

using OrdinaryDiffEq, SciMLSensitivity, Flux, DiffEqGPU, CUDA, Test
# Forbid scalar (element-by-element) indexing of GPU arrays so any accidental
# host-side scalar access raises an error instead of running slowly.
CUDA.allowscalar(false)

# In-place ODE right-hand side: exponential growth with rate scaled by the
# trainable parameters p[1] and p[2].
modelf(du, u, p, t) = (du[1] = 1.01 * u[1] * p[1] * p[2])

# Build and solve a 10-trajectory GPU ensemble of the scalar ODE.
# NOTE: reads the globals `u0` and `pa` defined later in the script; they must
# be assigned before `model()` is first called.
function model()
    base_prob = ODEProblem(modelf, u0, (0.0, 1.0), pa)

    # Each trajectory i starts from a slightly different initial condition.
    perturb_u0 = (prob, i, repeat) -> remake(prob, u0 = 0.5 .+ i / 100 .* prob.u0)

    ensemble = EnsembleProblem(base_prob, prob_func = perturb_u0)
    return solve(ensemble, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()),
        saveat = 0.1, trajectories = 10)
end

# Loss: squared distance of every saved ensemble state from the target value 1.
function loss()
    solution = Array(model())
    return sum(abs2, 1.0 .- solution)
end

# Trainable parameters and initial condition read by `model()`.
pa = [1.0, 2.0]
u0 = [3.0]

# Ten empty batches: each epoch just re-evaluates the closure `loss`.
data = Iterators.repeated((), 10)

# Print the loss after each training step.
cb = () -> @show(loss())

opt = ADAM(0.1)
println("Starting to train")

# Evaluate once before training (also triggers GPU compilation).
l1 = loss()

Flux.train!(loss, Flux.params([pa]), data, opt; cb = cb)
┌ Warning: Package cuDNN not found in current path.
- Run `import Pkg; Pkg.add("cuDNN")` to install the cuDNN package, then restart julia.
- If cuDNN is not installed, some Flux functionalities will not be available when running on the GPU.
@ FluxCUDAExt ~/.cache/julia-buildkite-plugin/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Flux/u7QSl/ext/FluxCUDAExt/FluxCUDAExt.jl:56
Starting to train
loss() = 185.0428068658791
loss() = 92.24232988545909
loss() = 47.578359342389
loss() = 25.594891269841597
loss() = 14.631980152159983
loss() = 9.21524236008198
loss() = 6.675379400460244
loss() = 5.658542480133761
loss() = 5.453213616020971
loss() = 5.674292984289548

Forward-mode automatic differentiation works as well, since the ensemble GPU kernels can be recompiled for Dual-number arithmetic:

# CUDA is needed for `CUDA.CUDABackend()` and SciMLBase for
# `SciMLBase.FullSpecialize`, both used below.
using OrdinaryDiffEq, SciMLBase, DiffEqGPU, ForwardDiff, CUDA, Test

# In-place Lorenz-63 right-hand side with parameters p = (σ, ρ, β).
function lorenz(du, u, p, t)
    x, y, z = u[1], u[2], u[3]
    du[1] = p[1] * (y - x)
    du[2] = x * (p[2] - z) - y
    du[3] = x * y - p[3] * z
end

# Seed each state component with a unit partial so the solution carries the
# full 3x3 Jacobian with respect to the initial condition.
u0 = [ForwardDiff.Dual(1.0f0, (1.0, 0.0, 0.0)),
    ForwardDiff.Dual(0.0f0, (0.0, 1.0, 0.0)),
    ForwardDiff.Dual(0.0f0, (0.0, 0.0, 1.0))]

tspan = (0.0f0, 100.0f0)
p = (10.0f0, 28.0f0, 8 / 3.0f0)

prob = ODEProblem{true, SciMLBase.FullSpecialize}(lorenz, u0, tspan, p)

# Randomly rescale the (global) parameter tuple for every trajectory.
function prob_func(prob, i, repeat)
    return remake(prob, p = rand(Float32, 3) .* p)
end

monteprob = EnsembleProblem(prob, prob_func = prob_func)
@time sol = solve(monteprob, Tsit5(), EnsembleGPUArray(CUDA.CUDABackend()),
    trajectories = 10_000, saveat = 1.0f0)
EnsembleSolution Solution of length 10000 with uType:
SciMLBase.ODESolution{ForwardDiff.Dual{Nothing, Float64, 3}, 2, uType, Nothing, Nothing, Vector{Float32}, rateType, SciMLBase.ODEProblem{Vector{ForwardDiff.Dual{Nothing, Float64, 3}}, Tuple{Float32, Float32}, true, Vector{Float32}, SciMLBase.ODEFunction{true, SciMLBase.FullSpecialize, typeof(Main.lorenz), LinearAlgebra.UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, OrdinaryDiffEq.Tsit5{typeof(OrdinaryDiffEq.trivial_limiter!), typeof(OrdinaryDiffEq.trivial_limiter!), Static.False}, IType, SciMLBase.DEStats, Nothing} where {uType, rateType, IType}