Ucto Tokeniser
<?xml version="1.0" encoding="UTF-8"?>
<cmd:CMD xmlns:cmd="http://www.clarin.eu/cmd/1"
xmlns:cmdp="http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1342181139640"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
CMDVersion="1.2"
xsi:schemaLocation="http://www.clarin.eu/cmd/1 https://infra.clarin.eu/CMDI/1.x/xsd/cmd-envelop.xsd http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1342181139640 https://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/1.1/profiles/clarin.eu:cr1:p_1342181139640/1.2/xsd">
<cmd:Header>
  <!-- Provenance of this metadata record: creator account, creation date,
       the CMDI profile it instantiates, and the collection it is shown under. -->
  <cmd:MdCreator>janodijk</cmd:MdCreator>
  <cmd:MdCreationDate>2018-08-01+02:00</cmd:MdCreationDate>
  <cmd:MdProfile>clarin.eu:cr1:p_1342181139640</cmd:MdProfile>
  <cmd:MdCollectionDisplayName>CLARIN Netherlands</cmd:MdCollectionDisplayName>
</cmd:Header>
<cmd:Resources>
  <!-- One resource proxy (id UCTO001) referencing the Ucto web service URL;
       the journal-file and resource-relation lists are present but empty. -->
  <cmd:ResourceProxyList>
    <cmd:ResourceProxy id="UCTO001">
      <cmd:ResourceType>Resource</cmd:ResourceType>
      <cmd:ResourceRef>https://webservices-lst.science.ru.nl/ucto/</cmd:ResourceRef>
    </cmd:ResourceProxy>
  </cmd:ResourceProxyList>
  <cmd:JournalFileProxyList/>
  <cmd:ResourceRelationList/>
</cmd:Resources>
<cmd:Components>
<cmdp:ClarinSoftwareDescription>
<cmdp:GeneralInfo>
  <!-- Identification of the tool: name, title, version, landing URL and release status. -->
  <cmdp:name xml:lang="eng">Ucto</cmdp:name>
  <cmdp:title xml:lang="eng">Ucto Tokeniser</cmdp:title>
  <cmdp:version>v0.13</cmdp:version>
  <!-- NOTE(review): the element is named publicationYear but holds a full date;
       confirm against the profile whether a bare year (2011) is expected. -->
  <cmdp:publicationYear>2011-03-27</cmdp:publicationYear>
  <cmdp:url>https://webservices-lst.science.ru.nl/ucto/</cmdp:url>
  <!-- NOTE(review): "none yet" is a placeholder value, not a centre name. -->
  <cmdp:CLARINCentre>none yet</cmdp:CLARINCentre>
  <cmdp:ReleaseStatus>
    <cmdp:LifeCycleStatus>published</cmdp:LifeCycleStatus>
    <cmdp:lastUpdate>2018-05-17</cmdp:lastUpdate>
  </cmdp:ReleaseStatus>
  <!-- National projects associated with the tool; both list the same contact person. -->
  <cmdp:NationalProjects>
    <cmdp:Project>
      <cmdp:name>CLARIN-NL</cmdp:name>
      <cmdp:title>CLARIN in the Netherlands</cmdp:title>
      <cmdp:id>184.021.003</cmdp:id>
      <cmdp:funder>NWO</cmdp:funder>
      <cmdp:url>http://www.clarin.nl</cmdp:url>
      <cmdp:Contact>
        <cmdp:Person>Jan Odijk</cmdp:Person>
        <cmdp:Role>National Coordinator</cmdp:Role>
        <cmdp:Address>Utrecht, the Netherlands</cmdp:Address>
        <cmdp:Email>j.odijk@uu.nl</cmdp:Email>
        <cmdp:Department>UiL-OTS</cmdp:Department>
        <cmdp:Organisation>Utrecht University</cmdp:Organisation>
      </cmdp:Contact>
      <cmdp:Duration>
        <cmdp:StartYear>2009</cmdp:StartYear>
        <cmdp:CompletionYear>2015</cmdp:CompletionYear>
      </cmdp:Duration>
    </cmdp:Project>
    <cmdp:Project>
      <cmdp:name>CLARIAH-CORE</cmdp:name>
      <cmdp:title>Common Lab Research Infrastructure for the Arts and the Humanities</cmdp:title>
      <cmdp:id>184.033.101</cmdp:id>
      <cmdp:funder>NWO</cmdp:funder>
      <cmdp:url>http://www.clariah.nl</cmdp:url>
      <cmdp:Contact>
        <cmdp:Person>Jan Odijk</cmdp:Person>
        <cmdp:Role>National Coordinator</cmdp:Role>
        <cmdp:Address>Utrecht, the Netherlands</cmdp:Address>
        <cmdp:Email>j.odijk@uu.nl</cmdp:Email>
        <cmdp:Department>UiL-OTS</cmdp:Department>
        <cmdp:Organisation>Utrecht University</cmdp:Organisation>
      </cmdp:Contact>
      <cmdp:Duration>
        <cmdp:StartYear>2015</cmdp:StartYear>
        <cmdp:CompletionYear>2018</cmdp:CompletionYear>
      </cmdp:Duration>
    </cmdp:Project>
  </cmdp:NationalProjects>
  <cmdp:Country>
    <cmdp:CountryName>Netherlands</cmdp:CountryName>
    <cmdp:CountryCoding>NL</cmdp:CountryCoding>
  </cmdp:Country>
  <!-- Free-text description. The language list matches the LanguageVariety entries
       declared under SoftwareFunction; the text is kept on one line so that no
       formatting whitespace ends up inside the value. -->
  <cmdp:Description>
    <cmdp:Description xml:lang="eng">Ucto tokenises text files: it separates words from punctuation and splits sentences. This is one of the first tasks for almost any Natural Language Processing application. Ucto also offers several other basic preprocessing steps, such as changing case, all of which you can use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation. The tokeniser engine is language independent. By supplying language-specific tokenisation rules in an external configuration file, a tokeniser can be created for a specific language. Ucto comes with tokenisation rules for Dutch, English, French, German, Italian, Portuguese, Russian, Spanish, and Swedish; it is easily extensible to other languages. It recognises dates, times, units, currencies, and abbreviations, as well as paired quote spans, sentences, and paragraphs. It produces UTF-8 encoded, NFC-normalised output and optionally accepts other encodings as input. It can optionally convert text to all lowercase or all uppercase. Ucto supports FoLiA XML.</cmdp:Description>
  </cmdp:Description>
</cmdp:GeneralInfo>
<cmdp:SoftwareFunction>
  <!-- Functional classification: tool category, tasks, research phase and domain. -->
  <cmdp:toolCategory>written language tool</cmdp:toolCategory>
  <cmdp:ToolTasks>
    <cmdp:toolTask>sentence splitting</cmdp:toolTask>
    <cmdp:toolTask>tokenisation</cmdp:toolTask>
  </cmdp:ToolTasks>
  <cmdp:ResearchPhases>
    <cmdp:ResearchPhase>Enriching Data</cmdp:ResearchPhase>
  </cmdp:ResearchPhases>
  <cmdp:ResearchDomains>
    <cmdp:researchDomain>Linguistics</cmdp:researchDomain>
  </cmdp:ResearchDomains>
  <!-- Linguistic subjects; their Description elements are present but empty. -->
  <cmdp:LinguisticsSubject>
    <cmdp:linguisticsSubject>general linguistics</cmdp:linguisticsSubject>
    <cmdp:Description>
      <cmdp:Description/>
    </cmdp:Description>
  </cmdp:LinguisticsSubject>
  <cmdp:LinguisticsSubject>
    <cmdp:linguisticsSubject>syntax</cmdp:linguisticsSubject>
    <cmdp:Description>
      <cmdp:Description/>
    </cmdp:Description>
  </cmdp:LinguisticsSubject>
  <!-- One LanguageVariety entry per language, each with its ISO 639-3 code and
       a century interval running from 20 through 21. -->
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>Dutch</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>nld</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>Swedish</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>swe</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>Russian</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>rus</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>Spanish</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>spa</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>Portuguese</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>por</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>English</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>German</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>deu</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>French</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>fra</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
  <cmdp:LanguageVariety>
    <cmdp:languageDependent>yes</cmdp:languageDependent>
    <cmdp:Language>
      <cmdp:LanguageName>Italian</cmdp:LanguageName>
      <cmdp:ISO639>
        <cmdp:iso-639-3-code>ita</cmdp:iso-639-3-code>
      </cmdp:ISO639>
    </cmdp:Language>
    <cmdp:Centuries>
      <cmdp:centuryDependent>yes</cmdp:centuryDependent>
      <cmdp:CenturyInterval>
        <cmdp:centuryFrom>20</cmdp:centuryFrom>
        <cmdp:centuryThrough>21</cmdp:centuryThrough>
      </cmdp:CenturyInterval>
    </cmdp:Centuries>
  </cmdp:LanguageVariety>
</cmdp:SoftwareFunction>
<cmdp:SoftwareImplementation>
  <!-- Distribution, installation requirements, user interfaces, and the input and
       output formats of the tool. -->
  <cmdp:distributionMedium>Online available</cmdp:distributionMedium>
  <cmdp:sourcecodeURI>https://github.com/LanguageMachines/ucto</cmdp:sourcecodeURI>
  <cmdp:InstallationRequirements>
    <cmdp:MinimumHardwareRequirements>
      <cmdp:SystemRequirements>
        <cmdp:workingMemoryMin>not specified</cmdp:workingMemoryMin>
        <cmdp:hardDiskMin>not specified</cmdp:hardDiskMin>
        <cmdp:Platform>
          <cmdp:operatingSystem>POSIX</cmdp:operatingSystem>
          <cmdp:operatingSystemVersion>not specified</cmdp:operatingSystemVersion>
          <cmdp:bitArchitecture>unknown</cmdp:bitArchitecture>
        </cmdp:Platform>
      </cmdp:SystemRequirements>
    </cmdp:MinimumHardwareRequirements>
    <cmdp:SoftwareRequirements>
      <!-- Libraries the tool depends on. -->
      <cmdp:RequiredSoftware>
        <cmdp:SoftwareShortDescription>
          <cmdp:resourceName>icu</cmdp:resourceName>
          <cmdp:version>not specified</cmdp:version>
          <cmdp:url>http://site.icu-project.org/design/cpp</cmdp:url>
          <cmdp:applicationType>localDesktop</cmdp:applicationType>
        </cmdp:SoftwareShortDescription>
        <cmdp:SoftwareShortDescription>
          <cmdp:resourceName>libxml2</cmdp:resourceName>
          <cmdp:version>not specified</cmdp:version>
          <!-- Points at the libxml2 C library itself rather than its Python bindings on PyPI. -->
          <cmdp:url>http://xmlsoft.org/</cmdp:url>
          <cmdp:applicationType>localDesktop</cmdp:applicationType>
        </cmdp:SoftwareShortDescription>
        <cmdp:SoftwareShortDescription>
          <cmdp:resourceName>ticcutils</cmdp:resourceName>
          <cmdp:version>not specified</cmdp:version>
          <cmdp:url>https://github.com/LanguageMachines/ticcutils</cmdp:url>
          <cmdp:applicationType>localDesktop</cmdp:applicationType>
        </cmdp:SoftwareShortDescription>
        <cmdp:SoftwareShortDescription>
          <cmdp:resourceName>libfolia</cmdp:resourceName>
          <cmdp:version>not specified</cmdp:version>
          <cmdp:url>https://github.com/LanguageMachines/libfolia</cmdp:url>
          <cmdp:applicationType>localDesktop</cmdp:applicationType>
        </cmdp:SoftwareShortDescription>
      </cmdp:RequiredSoftware>
    </cmdp:SoftwareRequirements>
  </cmdp:InstallationRequirements>
  <!-- Three ways of using the tool: command line, web application, web service. -->
  <cmdp:UserInterface>
    <cmdp:interfaceType>command line interface</cmdp:interfaceType>
    <cmdp:applicationType>local desktop</cmdp:applicationType>
  </cmdp:UserInterface>
  <cmdp:UserInterface>
    <cmdp:interfaceType>graphical user interface</cmdp:interfaceType>
    <cmdp:applicationType>web application</cmdp:applicationType>
  </cmdp:UserInterface>
  <cmdp:UserInterface>
    <cmdp:interfaceType>web interface</cmdp:interfaceType>
    <cmdp:applicationType>web service</cmdp:applicationType>
  </cmdp:UserInterface>
  <!-- Accepted inputs: PDF, MS-Word, FoLiA XML, and plain text in three encodings. -->
  <cmdp:Input>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:Schema>
      <cmdp:schemaname>PDF</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>application/pdf</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:Schema>
      <cmdp:schemaname>MS-Word</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>application/msword</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:Schema>
      <cmdp:schemaname>FoLiA</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <cmdp:characterEncoding>ISO-8859-1</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <!-- Hyphenated to match the ISO-8859-1 entry above.
         NOTE(review): confirm the profile does not restrict this value to a fixed list. -->
    <cmdp:characterEncoding>ISO-8859-15</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <!-- Produced outputs: one sentence per line, one token per line, or FoLiA XML;
       all carry sentence-boundary and token annotations. -->
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>One Sentence per Line</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>One Token per Line</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>FoLiA</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
</cmdp:SoftwareImplementation>
<cmdp:Access>
  <!-- Licence under which the tool is distributed, with its price. -->
  <cmdp:ResourceLicense>
    <cmdp:license>GNU GPL</cmdp:license>
    <cmdp:version>3.0</cmdp:version>
    <cmdp:distributionType>public</cmdp:distributionType>
    <cmdp:url>https://spdx.org/licenses/GPL-3.0</cmdp:url>
    <cmdp:Price>
      <cmdp:amount>0</cmdp:amount>
      <cmdp:ISO4217>
        <cmdp:iso-4217-currency>EUR</cmdp:iso-4217-currency>
      </cmdp:ISO4217>
    </cmdp:Price>
  </cmdp:ResourceLicense>
  <!-- Access contact. Leaf values are kept on a single line so that no line
       breaks become part of the name, e-mail address or URL. -->
  <cmdp:Contact>
    <cmdp:Person>Antal van den Bosch</cmdp:Person>
    <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
    <cmdp:Email>a.vandenbosch@let.ru.nl</cmdp:Email>
    <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
    <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
    <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
  </cmdp:Contact>
</cmdp:Access>
<cmdp:ResourceDocumentation>
  <!-- Reference guides (technical reports). -->
  <cmdp:Documentation>
    <cmdp:title>Maarten van Gompel, Ko van der Sloot and Antal van den Bosch. 2017. Ucto: Unicode Tokenizer. version 0.9.6. Reference Guide. Technical Report, Jan 23, 2017.</cmdp:title>
    <cmdp:documentationTarget>technical</cmdp:documentationTarget>
    <cmdp:url>https://raw.githubusercontent.com/proycon/ucto/master/docs/ucto_manual.pdf</cmdp:url>
    <cmdp:ISO639>
      <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
    </cmdp:ISO639>
  </cmdp:Documentation>
  <cmdp:Documentation>
    <cmdp:title>Maarten van Gompel, Ko van der Sloot, Antal van den Bosch (2012). Ucto: Unicode Tokeniser. Reference Guide. ILK Technical Report 12-05.</cmdp:title>
    <cmdp:documentationTarget>technical</cmdp:documentationTarget>
    <cmdp:url>http://ilk.uvt.nl/downloads/pub/papers/ilk.1205.pdf</cmdp:url>
    <cmdp:ISO639>
      <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
    </cmdp:ISO639>
  </cmdp:Documentation>
  <!-- Links into the source repository: readme, release notes, issue tracker. -->
  <cmdp:Documentation>
    <cmdp:title>readme</cmdp:title>
    <cmdp:documentationTarget>user</cmdp:documentationTarget>
    <cmdp:url>https://github.com/LanguageMachines/ucto/blob/master/README.md</cmdp:url>
    <cmdp:ISO639>
      <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
    </cmdp:ISO639>
  </cmdp:Documentation>
  <cmdp:Documentation>
    <cmdp:title>releaseNotes</cmdp:title>
    <cmdp:documentationTarget>user</cmdp:documentationTarget>
    <cmdp:url>https://github.com/LanguageMachines/ucto/releases</cmdp:url>
    <cmdp:ISO639>
      <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
    </cmdp:ISO639>
  </cmdp:Documentation>
  <cmdp:Documentation>
    <cmdp:title>issueTracker</cmdp:title>
    <cmdp:documentationTarget>technical</cmdp:documentationTarget>
    <cmdp:url>https://github.com/LanguageMachines/ucto/issues</cmdp:url>
    <cmdp:ISO639>
      <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
    </cmdp:ISO639>
  </cmdp:Documentation>
  <!-- Continuous-integration status page. -->
  <cmdp:Documentation>
    <cmdp:title>contIntegration</cmdp:title>
    <cmdp:documentationTarget>technical</cmdp:documentationTarget>
    <cmdp:url>https://travis-ci.org/LanguageMachines/ucto</cmdp:url>
    <cmdp:ISO639>
      <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
    </cmdp:ISO639>
  </cmdp:Documentation>
  <cmdp:Pictures>
    <cmdp:picture type="logo">https://raw.githubusercontent.com/LanguageMachines/ucto/master/logo.svg</cmdp:picture>
  </cmdp:Pictures>
</cmdp:ResourceDocumentation>
<cmdp:SoftwareDevelopment>
  <!-- Projects in which the tool was developed. Title and URL repeat the values
       given for the same projects under GeneralInfo/NationalProjects; contact and
       duration are left empty as in the original record. -->
  <cmdp:Project>
    <cmdp:name>CLARIN-NL</cmdp:name>
    <cmdp:title>CLARIN in the Netherlands</cmdp:title>
    <cmdp:funder>NWO</cmdp:funder>
    <cmdp:url>http://www.clarin.nl</cmdp:url>
    <cmdp:Contact>
      <cmdp:Person/>
      <cmdp:Email/>
      <cmdp:Organisation xml:lang="eng"/>
    </cmdp:Contact>
    <cmdp:Duration/>
  </cmdp:Project>
  <cmdp:Project>
    <cmdp:name>CLARIAH-CORE</cmdp:name>
    <cmdp:title>Common Lab Research Infrastructure for the Arts and the Humanities</cmdp:title>
    <cmdp:funder>NWO</cmdp:funder>
    <cmdp:url>http://www.clariah.nl</cmdp:url>
    <cmdp:Contact>
      <cmdp:Person/>
      <cmdp:Email/>
      <cmdp:Organisation xml:lang="eng"/>
    </cmdp:Contact>
    <cmdp:Duration/>
  </cmdp:Project>
  <!-- NOTE(review): this creator entry names the same person as the next one but
       has no role, e-mail or organisation; it looks like a leftover stub. -->
  <cmdp:Creator>
    <cmdp:Contact>
      <cmdp:Person>Antal van den Bosch</cmdp:Person>
      <cmdp:Email/>
      <cmdp:Organisation xml:lang="eng"/>
    </cmdp:Contact>
  </cmdp:Creator>
  <!-- Creators with their roles. Leaf values are kept on a single line so that no
       line breaks become part of names, e-mail addresses or URLs. -->
  <cmdp:Creator>
    <cmdp:Role>project lead</cmdp:Role>
    <cmdp:Contact>
      <cmdp:Person>Antal van den Bosch</cmdp:Person>
      <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
      <cmdp:Email>a.vandenbosch@let.ru.nl</cmdp:Email>
      <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
      <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
      <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
    </cmdp:Contact>
  </cmdp:Creator>
  <cmdp:Creator>
    <cmdp:Role>software developer</cmdp:Role>
    <cmdp:Contact>
      <cmdp:Person>Maarten van Gompel</cmdp:Person>
      <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
      <cmdp:Email>proycon@anaproy.nl</cmdp:Email>
      <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
      <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
      <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
    </cmdp:Contact>
  </cmdp:Creator>
  <cmdp:Creator>
    <cmdp:Role>software developer</cmdp:Role>
    <cmdp:Contact>
      <cmdp:Person>Ko van der Sloot</cmdp:Person>
      <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
      <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
      <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
      <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
    </cmdp:Contact>
  </cmdp:Creator>
</cmdp:SoftwareDevelopment>
<cmdp:TechnicalInfo>
  <!-- Implementation language of the tool; no language version is recorded. -->
  <cmdp:ImplementationLanguage>
    <cmdp:implementationLanguage>C++</cmdp:implementationLanguage>
    <cmdp:version>unknown</cmdp:version>
  </cmdp:ImplementationLanguage>
</cmdp:TechnicalInfo>
<cmdp:LRS>
  <!-- Invocation description for PDF, MS-Word and plain-text input
       (LRS presumably stands for Language Resource Switchboard; confirm).
       It declares accepted inputs, produced outputs, the parameters passed on
       invocation, and how the generic names map onto the service's own names. -->
  <cmdp:Authentication>Yes. Before tool use, please register at https://webservices-lst.science.ru.nl/register.</cmdp:Authentication>
  <cmdp:Description>
    <cmdp:Description>Ucto</cmdp:Description>
  </cmdp:Description>
  <cmdp:ToolTasks>
    <cmdp:toolTask>sentence splitting</cmdp:toolTask>
    <cmdp:toolTask>tokenisation</cmdp:toolTask>
  </cmdp:ToolTasks>
  <cmdp:Input>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:Schema>
      <cmdp:schemaname>PDF</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>application/pdf</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:Schema>
      <cmdp:schemaname>MS-Word</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>application/msword</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <cmdp:characterEncoding>ISO-8859-1</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Input>
    <!-- Hyphenated to match the ISO-8859-1 entry above.
         NOTE(review): confirm the profile does not restrict this value to a fixed list. -->
    <cmdp:characterEncoding>ISO-8859-15</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>One Sentence per Line</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>One Token per Line</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>FoLiA</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <!-- Same media type as the other FoLiA entries in this record. -->
      <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <!-- Parameters passed on invocation. The container is optional (0..1); when
       present it holds one or more ActualParameter entries. -->
  <cmdp:ActualParameters>
    <cmdp:ActualParameter>
      <cmdp:ActualParameterName>project</cmdp:ActualParameterName>
      <cmdp:ActualParameterValue>new</cmdp:ActualParameterValue>
    </cmdp:ActualParameter>
    <cmdp:ActualParameter>
      <cmdp:ActualParameterName>input</cmdp:ActualParameterName>
      <cmdp:ActualParameterValue>self.linkToResource</cmdp:ActualParameterValue>
    </cmdp:ActualParameter>
    <cmdp:ActualParameter>
      <cmdp:ActualParameterName>lang</cmdp:ActualParameterName>
      <cmdp:ActualParameterValue>self.linkToResourceLanguage</cmdp:ActualParameterValue>
    </cmdp:ActualParameter>
  </cmdp:ActualParameters>
  <!-- Maps the generic parameter names onto the names used for untokenised input. -->
  <cmdp:LRSMapping>
    <cmdp:LRSParameterName>input</cmdp:LRSParameterName>
    <cmdp:ActualParameterName>untokinput_url</cmdp:ActualParameterName>
  </cmdp:LRSMapping>
  <cmdp:LRSMapping>
    <cmdp:LRSParameterName>lang</cmdp:LRSParameterName>
    <cmdp:ActualParameterName>untokinput_language</cmdp:ActualParameterName>
  </cmdp:LRSMapping>
</cmdp:LRS>
<cmdp:LRS>
  <!-- Invocation description for FoLiA XML input: same tasks, outputs and
       parameters as the plain-text entry, but mapped onto the foliainput names. -->
  <cmdp:Authentication>Yes. Before tool use, please register at https://webservices-lst.science.ru.nl/register.</cmdp:Authentication>
  <cmdp:Description>
    <cmdp:Description>Ucto</cmdp:Description>
  </cmdp:Description>
  <cmdp:ToolTasks>
    <cmdp:toolTask>sentence splitting</cmdp:toolTask>
    <cmdp:toolTask>tokenisation</cmdp:toolTask>
  </cmdp:ToolTasks>
  <cmdp:Input>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:inputType>text</cmdp:inputType>
    <cmdp:Schema>
      <cmdp:schemaname>FoLiA</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
    </cmdp:MimeType>
  </cmdp:Input>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>One Sentence per Line</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>One Token per Line</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/plain</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <cmdp:Output>
    <cmdp:outputType>text</cmdp:outputType>
    <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
    <cmdp:Schema>
      <cmdp:schemaname>FoLiA</cmdp:schemaname>
    </cmdp:Schema>
    <cmdp:MimeType>
      <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
    </cmdp:MimeType>
    <cmdp:AnnotationType>
      <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
      <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
    </cmdp:AnnotationType>
  </cmdp:Output>
  <!-- Parameters passed on invocation. The container is optional (0..1); when
       present it holds one or more ActualParameter entries. -->
  <cmdp:ActualParameters>
    <cmdp:ActualParameter>
      <cmdp:ActualParameterName>project</cmdp:ActualParameterName>
      <cmdp:ActualParameterValue>new</cmdp:ActualParameterValue>
    </cmdp:ActualParameter>
    <cmdp:ActualParameter>
      <cmdp:ActualParameterName>input</cmdp:ActualParameterName>
      <cmdp:ActualParameterValue>self.linkToResource</cmdp:ActualParameterValue>
    </cmdp:ActualParameter>
    <cmdp:ActualParameter>
      <cmdp:ActualParameterName>lang</cmdp:ActualParameterName>
      <cmdp:ActualParameterValue>self.linkToResourceLanguage</cmdp:ActualParameterValue>
    </cmdp:ActualParameter>
  </cmdp:ActualParameters>
  <!-- Maps the generic parameter names onto the names used for FoLiA input. -->
  <cmdp:LRSMapping>
    <cmdp:LRSParameterName>input</cmdp:LRSParameterName>
    <cmdp:ActualParameterName>foliainput_url</cmdp:ActualParameterName>
  </cmdp:LRSMapping>
  <cmdp:LRSMapping>
    <cmdp:LRSParameterName>lang</cmdp:LRSParameterName>
    <cmdp:ActualParameterName>foliainput_language</cmdp:ActualParameterName>
  </cmdp:LRSMapping>
</cmdp:LRS>
</cmdp:ClarinSoftwareDescription>
</cmd:Components>
</cmd:CMD>
Organisation:
- Utrecht University
- Radboud University Nijmegen