<efrbr:recordSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:efrbr="http://vfrbr.info/efrbr/1.1" xmlns:efrbr-work="http://vfrbr.info/efrbr/1.1/work" xmlns:efrbr-expression="http://vfrbr.info/efrbr/1.1/expression" xmlns:efrbr-manifestation="http://vfrbr.info/efrbr/1.1/manifestation" xmlns:efrbr-person="http://vfrbr.info/efrbr/1.1/person" xmlns:efrbr-corporateBody="http://vfrbr.info/efrbr/1.1/corporateBody" xmlns:efrbr-concept="http://vfrbr.info/efrbr/1.1/concept" xmlns:efrbr-structure="http://vfrbr.info/efrbr/1.1/structure" xmlns:efrbr-responsible="http://vfrbr.info/efrbr/1.1/responsible" xmlns:efrbr-subject="http://vfrbr.info/efrbr/1.1/subject" xmlns:efrbr-other="http://vfrbr.info/efrbr/1.1/other" xsi:schemaLocation="http://vfrbr.info/efrbr/1.1 http://vfrbr.info/schemas/1.1/efrbr.xsd"><efrbr:entities><efrbr-work:work identifier="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE"><efrbr-work:titleOfTheWork>Model–free least–squares policy iteration</efrbr-work:titleOfTheWork></efrbr-work:work><efrbr-expression:expression identifier="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE"><efrbr-expression:titleOfTheExpression>Model–free least–squares policy iteration</efrbr-expression:titleOfTheExpression><efrbr-expression:formOfExpression vocabulary="DIAS:TYPES">
            Πλήρης Δημοσίευση σε Συνέδριο
            Conference Full Paper
         </efrbr-expression:formOfExpression><efrbr-expression:dateOfExpression type="issued">2015-11-14</efrbr-expression:dateOfExpression><efrbr-expression:dateOfExpression type="published">2001</efrbr-expression:dateOfExpression><efrbr-expression:languageOfExpression vocabulary="iso639-1">en</efrbr-expression:languageOfExpression><efrbr-expression:summarizationOfContent>We propose a new approach to reinforcement learning which combines
least squares function approximation with policy iteration. Our
method is model-free and completely off policy. We are motivated
by the least squares temporal difference learning algorithm (LSTD),
which is known for its efficient use of sample experiences compared
to pure temporal difference algorithms. LSTD is ideal for prediction
problems, however it heretofore has not had a straightforward application
to control problems. Moreover, approximations learned by LSTD
are strongly influenced by the visitation distribution over states. Our
new algorithm, Least-Squares Policy Iteration (LSPI) addresses these
issues. The result is an off-policy method which can use (or reuse)
data collected from any source. We test LSPI on several problems,
including a bicycle simulator in which it learns to guide the bicycle
to a goal efficiently by merely observing a relatively small number of
completely random trials.
</efrbr-expression:summarizationOfContent><efrbr-expression:useRestrictionsOnTheExpression type="creative-commons">http://creativecommons.org/licenses/by/4.0/</efrbr-expression:useRestrictionsOnTheExpression><efrbr-expression:note type="page range">1547–1554</efrbr-expression:note><efrbr-expression:note type="conference name">Neural Information Processing Systems</efrbr-expression:note><efrbr-expression:note type="proceedings title">Proceedings of NIPS*2001: Neural Information Processing Systems, Vancouver, BC, Canada</efrbr-expression:note></efrbr-expression:expression><efrbr-person:person identifier="http://users.isc.tuc.gr/~lagoudakis"><efrbr-person:nameOfPerson vocabulary="TUC:LDAP">
            Lagoudakis Michael
            Λαγουδακης Μιχαηλ
         </efrbr-person:nameOfPerson></efrbr-person:person><efrbr-person:person identifier="B502CEBB-00C4-4710-AE87-9BEFB95676D9"><efrbr-person:nameOfPerson vocabulary="">
            Parr, R.
         </efrbr-person:nameOfPerson></efrbr-person:person><efrbr-concept:concept identifier="D3362B83-B626-4811-9B98-C7DE2D9669AB"><efrbr-concept:termForTheConcept>
             Artificial Intelligence
         </efrbr-concept:termForTheConcept></efrbr-concept:concept></efrbr:entities><efrbr:relationships><efrbr-structure:structureRelations><efrbr-structure:realizedThrough sourceEntity="work" sourceURI="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE" targetEntity="expression" targetURI="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE"/></efrbr-structure:structureRelations><efrbr-responsible:responsibleRelations><efrbr-responsible:createdBy sourceEntity="work" sourceURI="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE" targetEntity="person" targetURI="http://users.isc.tuc.gr/~lagoudakis"/><efrbr-responsible:realizedBy sourceEntity="expression" sourceURI="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE" targetEntity="person" targetURI="http://users.isc.tuc.gr/~lagoudakis" role="author"/><efrbr-responsible:realizedBy sourceEntity="expression" sourceURI="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE" targetEntity="person" targetURI="B502CEBB-00C4-4710-AE87-9BEFB95676D9" role="author"/></efrbr-responsible:responsibleRelations><efrbr-subject:subjectRelations><efrbr-subject:hasSubject sourceEntity="work" sourceURI="http://purl.tuc.gr/dl/dias/CDADBEEF-15F4-44B5-89B2-295FEC71FDAE" targetEntity="concept" targetURI="D3362B83-B626-4811-9B98-C7DE2D9669AB"/></efrbr-subject:subjectRelations><efrbr-other:otherRelations/></efrbr:relationships></efrbr:recordSet>