Custom deep reinforcement learning training loop: the code reports no errors, but during training it seems unable to read the data
rng(0)
env = rlPredefinedEnv('CartPoleSimscapeModel-Continuous');
%Extract the observation and action specifications from the environment.
obsInfo = getObservationInfo(env);
actInfo = getActionInfo(env);
%Obtain the number of observations (numObs) and actions (numAct).
numObs = obsInfo.Dimension(1);
numAct = actInfo.Dimension(1);
%Set a sample time for the environment
Ts = 0.01;
Tf=20;
%%
% Critic and actor networks
criticLayerSizes = [128 200];
actorLayerSizes = [128 200];
%createNetworkWeights;
statePath = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(128,'Name','CriticStateFC1')
    reluLayer('Name','CriticRelu1')
    fullyConnectedLayer(200,'Name','CriticStateFC2')];
actionPath = [
    featureInputLayer(1,'Normalization','none','Name','action')
    fullyConnectedLayer(200,'Name','CriticActionFC1','BiasLearnRateFactor',0)];
commonPath = [
    additionLayer(2,'Name','add')
    reluLayer('Name','CriticCommonRelu')
    fullyConnectedLayer(1,'Name','CriticOutput')];
criticNetwork = layerGraph(statePath);
criticNetwork = addLayers(criticNetwork,actionPath);
criticNetwork = addLayers(criticNetwork,commonPath);
criticNetwork = connectLayers(criticNetwork,'CriticStateFC2','add/in1');
criticNetwork = connectLayers(criticNetwork,'CriticActionFC1','add/in2');
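% The critic is a two-input Q-network: the observation path and the action
% path are merged by the addition layer and followed by a shared ReLU and a
% single scalar output, Q(observation,action).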
criticNetwork = dlnetwork(criticNetwork);
% Create the critic function approximator.
criticOptions = rlOptimizerOptions('LearnRate',1e-03,'GradientThreshold',1);
critic = rlQValueFunction(criticNetwork,obsInfo,actInfo);
criticOptimizer = rlOptimizer(criticOptions);
% Actor network
actorNetwork = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(128,'Name','ActorFC1')
    reluLayer('Name','ActorRelu1')
    fullyConnectedLayer(200,'Name','ActorFC2')
    reluLayer('Name','ActorRelu2')
    fullyConnectedLayer(1,'Name','ActorFC3')
    tanhLayer('Name','ActorTanh1')
    scalingLayer('Name','ActorScaling','Scale',max(actInfo.UpperLimit))];
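% The tanh layer bounds the network output to [-1,1]; the scaling layer then
% multiplies it by max(actInfo.UpperLimit) so that the action stays within
% the limits of the continuous action space.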
actorNetwork = dlnetwork(actorNetwork);
% Create the actor function approximator.
actorOptions = rlOptimizerOptions('LearnRate',5e-04,'GradientThreshold',1);
actor = rlContinuousDeterministicActor(actorNetwork,obsInfo,actInfo);
actorOptimizer  = rlOptimizer(actorOptions);
policy = rlDeterministicActorPolicy(actor);
agentOptions = rlDDPGAgentOptions(...
    'SampleTime',Ts,...
    'ActorOptimizerOptions',actorOptions,...
    'CriticOptimizerOptions',criticOptions,...
    'ExperienceBufferLength',1e6,...
    'MiniBatchSize',128);
agentOptions.NoiseOptions.Variance = 0.4;
agentOptions.NoiseOptions.VarianceDecayRate = 1e-5;
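% As far as I understand the noise options, the Ornstein-Uhlenbeck noise
% variance decays each sample time roughly as
% Variance <- (1 - VarianceDecayRate)*Variance, down to VarianceMin.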
agent = rlDDPGAgent(actor,critic,agentOptions);
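% Optional sanity check before training (a minimal sketch, assuming the usual
% cell-array syntax of getAction/getValue for these approximator objects):
% sampleObs = {rand(numObs,1)};
% sampleAct = getAction(actor,sampleObs);            % 1x1 cell containing the scalar force
% sampleQ   = getValue(critic,sampleObs,sampleAct);  % scalar Q-value estimate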
%%
% Create the experience buffer
myBuffer.bufferSize = 500;
myBuffer.bufferIndex = 0;
myBuffer.currentBufferLength = 0;
myBuffer.observation = zeros(numObs,1,myBuffer.bufferSize);
myBuffer.nextObservation = zeros(numObs,1,myBuffer.bufferSize);
myBuffer.action = zeros(numAct,1,myBuffer.bufferSize);
myBuffer.reward = zeros(1,myBuffer.bufferSize);
myBuffer.isDone = zeros(1,myBuffer.bufferSize);
%processExpData structure
processExpData.Critic = critic;
processExpData.TargetCritic = critic;
processExpData.Actor = actor;
processExpData.TargetActor = actor;
processExpData.MyBuffer = myBuffer;
processExpData.CriticOptimizer = criticOptimizer;
processExpData.ActorOptimizer = actorOptimizer;
processExpData.MiniBatchSize = 128;
processExpData.DiscountFactor = 0.99;
processExpData.TargetSmoothFactor = 1e-3;
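% TargetSmoothFactor is the soft-update coefficient tau used later by
% syncParameters: targetParams <- tau*params + (1 - tau)*targetParams.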
maxEpisodes = 1000;
maxSteps = ceil(Tf/Ts);
trainingTerminationValue = 480;
aveWindowSize = 100;                 % window length for the moving average (used by movmean below)
episodeCumulativeRewardVector = [];  % stores the cumulative reward of each episode
[trainingPlot,lineReward,lineAveReward] = hBuildFigure; % build the training progress figure
% Enable the training visualization plot.
set(trainingPlot,'Visible','on');
%%
%train
doTraining = true;
if doTraining
    % Training loop
    for episodeCt = 1:maxEpisodes
        % update actor, critic
        agent = setActor(agent,actor);
        agent = setCritic(agent,critic);
        out = sim(agent,env);
        myBuffer.observation     = out.Observation.observations.Data(:,:,1:myBuffer.bufferSize-1);
        myBuffer.nextObservation = out.Observation.observations.Data(:,:,2:myBuffer.bufferSize);
        myBuffer.action = out.Action.force.Data;
        myBuffer.reward = out.Reward.Data';  % transpose to a row vector
        myBuffer.isDone = out.IsDone.Data';
        % Build a mini-batch from the logged experiences.
        BatchSize.observations    = myBuffer.observation(:,:,1:processExpData.MiniBatchSize);
        BatchSize.nextObservation = myBuffer.nextObservation(:,:,1:processExpData.MiniBatchSize);
        BatchSize.action = myBuffer.action(:,:,1:processExpData.MiniBatchSize);
        BatchSize.reward = myBuffer.reward(:,1:processExpData.MiniBatchSize);
        BatchSize.isDone = myBuffer.isDone(:,1:processExpData.MiniBatchSize);
        BatchSize.nextObs{1} = BatchSize.nextObservation;
        BatchSize.obs{1}     = BatchSize.observations;
        BatchSize.act{1}     = BatchSize.action;   % wrap the action batch in a cell as well
        for epoch = 1:maxSteps
            if ~isempty(BatchSize)
                % Update network parameters using the mini-batch.
                [processExpData,actorParams] = learnFcn(processExpData,BatchSize);
                % Update the policy parameters using the actor parameters.
                policy = setLearnableParameters(policy,actorParams);
            end
        end
        % Extract the critic and actor networks from processExpData.
        critic = processExpData.Critic;
        actor  = processExpData.Actor;
        % Extract the cumulative reward and calculate average reward 
        % per step for this episode.
        episodeCumulativeReward = sum(myBuffer.reward);   % cumulative reward over the whole episode
        episodeCumulativeRewardVector = cat(2,...
          episodeCumulativeRewardVector,episodeCumulativeReward);
        movingAveReward = movmean(episodeCumulativeRewardVector,...
         aveWindowSize,2);
        addpoints(lineReward,episodeCt,episodeCumulativeReward);
        addpoints(lineAveReward,episodeCt,movingAveReward(end));
        drawnow;
        if max(movingAveReward) > trainingTerminationValue
           break
        end
    end
end
% %plot env
% obs = reset(env);
% plot(env);
% for stepCt = 1:maxSteps
%     
%     % Select action according to trained policy
%     action = getAction(actor,{obs});
%         
%     % Step the environment
%     [nextObs,reward,isdone] = step(env,action{1});
%     
%     % Check for terminal condition
%     if isdone
%         break
%     end
%     
%     obs = nextObs;
%     
% end
%%
function [processExpData,actorParams] = learnFcn(processExpData,BatchSize)
% Find the terminal experiences.
doneidx = (BatchSize.isDone == 1);
% Compute target next actions against the next observations.
nextAction = evaluate(processExpData.TargetActor,BatchSize.nextObs); % observations must be passed as a cell array
% compute qtarget = reward + gamma*Q(nextObservation,nextAction)
%                 = reward + gamma*expectedFutureReturn
targetq = BatchSize.reward;
% Bootstrap the target at nonterminal experiences.
expectedFutureReturn = ...
    getValue(processExpData.TargetCritic,BatchSize.nextObs,nextAction);
targetq(~doneidx) = targetq(~doneidx) + ...
    processExpData.DiscountFactor.*expectedFutureReturn(~doneidx);
% Compute critic gradient using deepCriticLoss function.
criticGradient = gradient(processExpData.Critic,@deepCriticLoss,...
    [BatchSize.obs,BatchSize.act],targetq);
% Update the critic parameters.
[processExpData.Critic,processExpData.CriticOptimizer] = update(...
    processExpData.CriticOptimizer,processExpData.Critic,...
    criticGradient);
% Compute the actor gradient using the deepActorGradient function. To
% accelerate the deepActorGradient function, the critic network is
% extracted outside the function and is passed in as a field to the
% actorGradData input struct.
actorGradData.CriticNet = getModel(processExpData.Critic);
actorGradData.MiniBatchSize = processExpData.MiniBatchSize;
actorGradient = customGradient(processExpData.Actor,@deepActorGradient,...
    BatchSize.obs,actorGradData);
% Update the actor parameters.
[processExpData.Actor,processExpData.ActorOptimizer] = update(...
    processExpData.ActorOptimizer,processExpData.Actor,...
    actorGradient);
actorParams = getLearnableParameters(processExpData.Actor);
% Update targets using the given TargetSmoothFactor hyperparameter.
processExpData.TargetCritic = syncParameters(processExpData.TargetCritic,...
    processExpData.Critic,processExpData.TargetSmoothFactor);
processExpData.TargetActor  = syncParameters(processExpData.TargetActor ,...
    processExpData.Actor ,processExpData.TargetSmoothFactor);
end
function loss = deepCriticLoss(q,targetq)
q = q{1};
% Loss is the half mean-square error of q = Q(observation,action)
%against  qtarget
loss = mse(q,reshape(targetq,size(q)));
end
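% Note: for dlarray inputs, mse returns the half mean squared error,
% i.e. loss = sum((q - qtarget).^2)/(2*N), which is the "half mean-square
% error" mentioned in the comment above.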
function dQdTheta = deepActorGradient(actorNet,observation,gradData)
% Evaluate actions from current observations.
action = forward(actorNet,observation{:});
% Compute: q = Q(s,a)
q = predict(gradData.CriticNet,observation{:},action);
% Compute: qsum = -sum(q)/N to maximize q
qsum = -sum(q,"all")/gradData.MiniBatchSize;
% Compute: d(-sum(q)/N)/dActorParams
dQdTheta = dlgradient(qsum,actorNet.Learnables);
end
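% Minimizing -sum(q)/N is equivalent to ascending the deterministic policy
% gradient: dJ/dtheta is approximated by (1/N)*sum_i dQ(s_i,mu_theta(s_i))/dtheta.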
function [trainingPlot, lineReward, lineAveReward] = hBuildFigure()
    plotRatio = 16/9;
    trainingPlot = figure(...
                'Visible','off',...
                'HandleVisibility','off', ...
                'NumberTitle','off',...
                'Name','Cart Pole Custom Training');
    trainingPlot.Position(3) = plotRatio * trainingPlot.Position(4);
    ax = gca(trainingPlot);
    lineReward = animatedline(ax);
    lineAveReward = animatedline(ax,'Color','r','LineWidth',3);
    xlabel(ax,'Episode');
    ylabel(ax,'Reward');
    legend(ax,'Cumulative Reward','Average Reward','Location','northwest')
    title(ax,'Training Progress');
end
When I interrupt the run, MATLAB shows the following stack trace:
Operation terminated by user during deep.internal.recording.convert.tapeToFunction
In deep.AcceleratedFunction>iGenerateBackwardFunctionNoCleanup (line 637)
[backwardFun, backwardFileName] = deep.internal.recording.convert.tapeToFunction(tape, backwardInputIDs, gradIDs);
In deep.AcceleratedFunction>iGenerateBackwardFunction (line 603)
[backwardFun, backwardFileName] = iGenerateBackwardFunctionNoCleanup(args,numIntermediateAdjointsToDrop);
In deep.AcceleratedFunction/augmentWithBackwardFunctions (line 467)
                [fullBackwardFun, fullBackwardFileName] = iGenerateBackwardFunction(args, 0);
In deep.AcceleratedFunction/generateForward (line 442)
                    fun = augmentWithBackwardFunctions(obj, args, numIntermediates, generatedCode);
In  ()  (line 262)
            [cacheData, varargout, illegalOutputs] = generateForward(obj, varargout, inputNodes, tm, priorTapeCount, isTracing);
In nnet.internal.cnn.layer.CodegenFusedLayer/evaluate (line 153)
                [Z{1:nout}] = trainingFun(X, this.Learnables, this.State);
In nnet.internal.cnn.layer.CodegenFusedLayer/predict (line 75)
            [varargout{1:nargout}] = evaluate(this, X, @predictPropagate, this.PredictTrainingFcn, this.PredictInferenceCache);
In nnet.internal.cnn.layer.GraphExecutor>iPredictWithoutState (line 407)
    out = predict(layer, in);