<?xml version="1.0" encoding="UTF-8"?>
<!-- Ucto Tokeniser -->
<cmd:CMD xmlns:cmd="http://www.clarin.eu/cmd/1"
         xmlns:cmdp="http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1342181139640"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         CMDVersion="1.2"
         xsi:schemaLocation="http://www.clarin.eu/cmd/1 https://infra.clarin.eu/CMDI/1.x/xsd/cmd-envelop.xsd http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1342181139640 https://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/1.1/profiles/clarin.eu:cr1:p_1342181139640/1.2/xsd">
   <!-- Record envelope header: who created this metadata record, when, which
        CMDI profile it instantiates, and the collection it is displayed under. -->
   <cmd:Header>
      <cmd:MdCreator>janodijk</cmd:MdCreator>
      <!-- Creation date carries an explicit +02:00 timezone offset. -->
      <cmd:MdCreationDate>2018-08-01+02:00</cmd:MdCreationDate>
      <!-- Same profile ID as the one in the cmdp namespace bound on the root element. -->
      <cmd:MdProfile>clarin.eu:cr1:p_1342181139640</cmd:MdProfile>
      <cmd:MdCollectionDisplayName>CLARIN Netherlands</cmd:MdCollectionDisplayName>
   </cmd:Header>
   <!-- Resource section: a single proxy referencing the Ucto service URL; the
        journal-file and resource-relation lists are present but empty. -->
   <cmd:Resources>
      <cmd:ResourceProxyList>
         <cmd:ResourceProxy id="UCTO001">
            <cmd:ResourceType>Resource</cmd:ResourceType>
            <cmd:ResourceRef>https://webservices-lst.science.ru.nl/ucto/</cmd:ResourceRef>
         </cmd:ResourceProxy>
      </cmd:ResourceProxyList>
      <cmd:JournalFileProxyList/>
      <cmd:ResourceRelationList/>
   </cmd:Resources>
   <cmd:Components>
      <cmdp:ClarinSoftwareDescription>
         <!-- General identification of the tool: names, version, URL, release
              status, funding projects, country and a prose description. -->
         <cmdp:GeneralInfo>
            <cmdp:name xml:lang="eng">Ucto</cmdp:name>
            <cmdp:title xml:lang="eng">Ucto Tokeniser</cmdp:title>
            <cmdp:version>v0.13</cmdp:version>
            <!-- NOTE(review): holds a full date although the element is named
                 publicationYear; confirm the profile accepts this form. -->
            <cmdp:publicationYear>2011-03-27</cmdp:publicationYear>
            <cmdp:url>https://webservices-lst.science.ru.nl/ucto/</cmdp:url>
            <cmdp:CLARINCentre>none yet</cmdp:CLARINCentre>
            <cmdp:ReleaseStatus>
               <cmdp:LifeCycleStatus>published</cmdp:LifeCycleStatus>
               <cmdp:lastUpdate>2018-05-17</cmdp:lastUpdate>
            </cmdp:ReleaseStatus>
            <!-- Two national projects are listed, both funded by NWO and sharing
                 the same contact person. -->
            <cmdp:NationalProjects>
               <cmdp:Project>
                  <cmdp:name>CLARIN-NL</cmdp:name>
                  <cmdp:title>CLARIN in the Netherlands</cmdp:title>
                  <cmdp:id>184.021.003</cmdp:id>
                  <cmdp:funder>NWO</cmdp:funder>
                  <cmdp:url>http://www.clarin.nl</cmdp:url>
                  <cmdp:Contact>
                     <cmdp:Person>Jan Odijk</cmdp:Person>
                     <cmdp:Role>National Coordinator</cmdp:Role>
                     <cmdp:Address>Utrecht, the Netherlands</cmdp:Address>
                     <cmdp:Email>j.odijk@uu.nl</cmdp:Email>
                     <cmdp:Department>UiL-OTS</cmdp:Department>
                     <cmdp:Organisation>Utrecht University</cmdp:Organisation>
                  </cmdp:Contact>
                  <cmdp:Duration>
                     <cmdp:StartYear>2009</cmdp:StartYear>
                     <cmdp:CompletionYear>2015</cmdp:CompletionYear>
                  </cmdp:Duration>
               </cmdp:Project>
               <cmdp:Project>
                  <cmdp:name>CLARIAH-CORE</cmdp:name>
                  <cmdp:title>Common Lab Research Infrastructure for the Arts and the Humanities</cmdp:title>
                  <cmdp:id>184.033.101</cmdp:id>
                  <cmdp:funder>NWO</cmdp:funder>
                  <cmdp:url>http://www.clariah.nl</cmdp:url>
                  <cmdp:Contact>
                     <cmdp:Person>Jan Odijk</cmdp:Person>
                     <cmdp:Role>National Coordinator</cmdp:Role>
                     <cmdp:Address>Utrecht, the Netherlands</cmdp:Address>
                     <cmdp:Email>j.odijk@uu.nl</cmdp:Email>
                     <cmdp:Department>UiL-OTS</cmdp:Department>
                     <cmdp:Organisation>Utrecht University</cmdp:Organisation>
                  </cmdp:Contact>
                  <cmdp:Duration>
                     <cmdp:StartYear>2015</cmdp:StartYear>
                     <cmdp:CompletionYear>2018</cmdp:CompletionYear>
                  </cmdp:Duration>
               </cmdp:Project>
            </cmdp:NationalProjects>
            <cmdp:Country>
               <cmdp:CountryName>Netherlands</cmdp:CountryName>
               <cmdp:CountryCoding>NL</cmdp:CountryCoding>
            </cmdp:Country>
            <!-- The description text is kept on a single line with no padding,
                 because whitespace inside a text element is part of the value.
                 The language list is kept in step with the LanguageVariety
                 entries under SoftwareFunction. -->
            <cmdp:Description>
               <cmdp:Description xml:lang="eng">Ucto tokenizes text files: it separates words from punctuation, and splits sentences. This is one of the first tasks for almost any Natural Language Processing application. Ucto offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation. The tokeniser engine is language independent. By supplying language-specific tokenisation rules in an external configuration file a tokeniser can be created for a specific language. Ucto comes with tokenization rules for Dutch, English, French, German, Italian, Portuguese, Russian, Spanish, and Swedish; it is easily extendible to other languages. It recognizes dates, times, units, currencies, abbreviations. It recognizes paired quote spans, sentences, and paragraphs. It produces UTF8 encoding and NFC output normalization, optionally accepts other encodings as input. Optional conversion to all lowercase or uppercase. Ucto supports FoLiA XML.</cmdp:Description>
            </cmdp:Description>
         </cmdp:GeneralInfo>
         <!-- Functional classification of the tool: category, tasks, research
              phase and domain, linguistic subjects, and the languages covered. -->
         <cmdp:SoftwareFunction>
            <cmdp:toolCategory>written language tool</cmdp:toolCategory>
            <cmdp:ToolTasks>
               <cmdp:toolTask>sentence splitting</cmdp:toolTask>
               <cmdp:toolTask>tokenisation</cmdp:toolTask>
            </cmdp:ToolTasks>
            <cmdp:ResearchPhases>
               <cmdp:ResearchPhase>Enriching Data</cmdp:ResearchPhase>
            </cmdp:ResearchPhases>
            <cmdp:ResearchDomains>
               <cmdp:researchDomain>Linguistics</cmdp:researchDomain>
            </cmdp:ResearchDomains>
            <!-- Both linguistic subjects carry an empty description placeholder. -->
            <cmdp:LinguisticsSubject>
               <cmdp:linguisticsSubject>general linguistics</cmdp:linguisticsSubject>
               <cmdp:Description>
                  <cmdp:Description/>
               </cmdp:Description>
            </cmdp:LinguisticsSubject>
            <cmdp:LinguisticsSubject>
               <cmdp:linguisticsSubject>syntax</cmdp:linguisticsSubject>
               <cmdp:Description>
                  <cmdp:Description/>
               </cmdp:Description>
            </cmdp:LinguisticsSubject>
            <!-- Nine LanguageVariety entries follow, one per language. Each one
                 pairs a language name with its ISO 639-3 code and declares the
                 same century interval (20 through 21). -->
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>Dutch</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>nld</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>Swedish</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>swe</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>Russian</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>rus</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>Spanish</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>spa</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>Portuguese</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>por</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>English</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>German</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>deu</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>French</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>fra</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>Italian</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>ita</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
         </cmdp:SoftwareFunction>
         <!-- Implementation details: distribution, source repository,
              installation requirements, user interfaces, and the accepted
              input and produced output formats. -->
         <cmdp:SoftwareImplementation>
            <cmdp:distributionMedium>Online available</cmdp:distributionMedium>
            <cmdp:sourcecodeURI>https://github.com/LanguageMachines/ucto</cmdp:sourcecodeURI>
            <cmdp:InstallationRequirements>
               <cmdp:MinimumHardwareRequirements>
                  <cmdp:SystemRequirements>
                     <cmdp:workingMemoryMin>not specified</cmdp:workingMemoryMin>
                     <cmdp:hardDiskMin>not specified</cmdp:hardDiskMin>
                     <cmdp:Platform>
                        <cmdp:operatingSystem>POSIX</cmdp:operatingSystem>
                        <cmdp:operatingSystemVersion>not specified</cmdp:operatingSystemVersion>
                        <cmdp:bitArchitecture>unknown</cmdp:bitArchitecture>
                     </cmdp:Platform>
                  </cmdp:SystemRequirements>
               </cmdp:MinimumHardwareRequirements>
               <!-- Required libraries; no version constraint is recorded for any
                    of them. -->
               <cmdp:SoftwareRequirements>
                  <cmdp:RequiredSoftware>
                     <cmdp:SoftwareShortDescription>
                        <cmdp:resourceName>icu</cmdp:resourceName>
                        <cmdp:version>not specified</cmdp:version>
                        <cmdp:url>http://site.icu-project.org/design/cpp</cmdp:url>
                        <cmdp:applicationType>localDesktop</cmdp:applicationType>
                     </cmdp:SoftwareShortDescription>
                     <cmdp:SoftwareShortDescription>
                        <cmdp:resourceName>libxml2</cmdp:resourceName>
                        <cmdp:version>not specified</cmdp:version>
                        <!-- Home page of the libxml2 library itself rather than
                             of its Python bindings. -->
                        <cmdp:url>http://xmlsoft.org/</cmdp:url>
                        <cmdp:applicationType>localDesktop</cmdp:applicationType>
                     </cmdp:SoftwareShortDescription>
                     <cmdp:SoftwareShortDescription>
                        <cmdp:resourceName>ticcutils</cmdp:resourceName>
                        <cmdp:version>not specified</cmdp:version>
                        <cmdp:url>https://github.com/LanguageMachines/ticcutils</cmdp:url>
                        <cmdp:applicationType>localDesktop</cmdp:applicationType>
                     </cmdp:SoftwareShortDescription>
                     <cmdp:SoftwareShortDescription>
                        <cmdp:resourceName>libfolia</cmdp:resourceName>
                        <cmdp:version>not specified</cmdp:version>
                        <cmdp:url>https://github.com/LanguageMachines/libfolia</cmdp:url>
                        <cmdp:applicationType>localDesktop</cmdp:applicationType>
                     </cmdp:SoftwareShortDescription>
                  </cmdp:RequiredSoftware>
               </cmdp:SoftwareRequirements>
            </cmdp:InstallationRequirements>
            <!-- Three interface / application-type combinations are declared. -->
            <cmdp:UserInterface>
               <cmdp:interfaceType>command line interface</cmdp:interfaceType>
               <cmdp:applicationType>local desktop</cmdp:applicationType>
            </cmdp:UserInterface>
            <cmdp:UserInterface>
               <cmdp:interfaceType>graphical user interface</cmdp:interfaceType>
               <cmdp:applicationType>web application</cmdp:applicationType>
            </cmdp:UserInterface>
            <cmdp:UserInterface>
               <cmdp:interfaceType>web interface</cmdp:interfaceType>
               <cmdp:applicationType>web service</cmdp:applicationType>
            </cmdp:UserInterface>
            <!-- Inputs: PDF, MS-Word, FoLiA XML, and plain text in three
                 character encodings. -->
            <cmdp:Input>
               <cmdp:inputType>text</cmdp:inputType>
               <cmdp:Schema>
                  <cmdp:schemaname>PDF</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>application/pdf</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <cmdp:Input>
               <cmdp:inputType>text</cmdp:inputType>
               <cmdp:Schema>
                  <cmdp:schemaname>MS-Word</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>application/msword</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <cmdp:Input>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:inputType>text</cmdp:inputType>
               <cmdp:Schema>
                  <cmdp:schemaname>FoLiA</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <cmdp:Input>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:inputType>text</cmdp:inputType>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <cmdp:Input>
               <cmdp:characterEncoding>ISO-8859-1</cmdp:characterEncoding>
               <cmdp:inputType>text</cmdp:inputType>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <cmdp:Input>
               <!-- Spelled with hyphens, matching the ISO-8859-1 entry above. -->
               <cmdp:characterEncoding>ISO-8859-15</cmdp:characterEncoding>
               <cmdp:inputType>text</cmdp:inputType>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <!-- Outputs: three UTF-8 text layouts, each annotated with sentence
                 boundaries and tokens. -->
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:Schema>
                  <cmdp:schemaname>One Sentence per Line</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:Schema>
                  <cmdp:schemaname>One Token per Line</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:Schema>
                  <cmdp:schemaname>FoLiA</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
         </cmdp:SoftwareImplementation>
         <!-- Access conditions: the licence under which the tool is distributed,
              its price, and the person to contact about access. -->
         <cmdp:Access>
            <cmdp:ResourceLicense>
               <cmdp:license>GNU GPL</cmdp:license>
               <cmdp:version>3.0</cmdp:version>
               <cmdp:distributionType>public</cmdp:distributionType>
               <cmdp:url>https://spdx.org/licenses/GPL-3.0</cmdp:url>
               <cmdp:Price>
                  <cmdp:amount>0</cmdp:amount>
                  <cmdp:ISO4217>
                     <cmdp:iso-4217-currency>EUR</cmdp:iso-4217-currency>
                  </cmdp:ISO4217>
               </cmdp:Price>
            </cmdp:ResourceLicense>
            <!-- Contact values are written without surrounding whitespace so
                 that no indentation ends up inside the name, e-mail address,
                 organisation or URL. -->
            <cmdp:Contact>
               <cmdp:Person>Antal van den Bosch</cmdp:Person>
               <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
               <cmdp:Email>a.vandenbosch@let.ru.nl</cmdp:Email>
               <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
               <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
               <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
            </cmdp:Contact>
         </cmdp:Access>
         <!-- Documentation pointers: two reference guides, the readme, release
              notes, issue tracker and continuous-integration page, followed by
              the tool logo. Every entry is marked as English (eng). -->
         <cmdp:ResourceDocumentation>
            <cmdp:Documentation>
               <cmdp:title>Maarten van Gompel, Ko van der Sloot and Antal van den Bosch. 2017. Ucto: Unicode Tokenizer. version 0.9.6. Reference Guide. Technical Report, Jan 23, 2017.</cmdp:title>
               <cmdp:documentationTarget>technical</cmdp:documentationTarget>
               <cmdp:url>https://raw.githubusercontent.com/proycon/ucto/master/docs/ucto_manual.pdf</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <cmdp:Documentation>
               <cmdp:title>Maarten van Gompel, Ko van der Sloot, Antal van den Bosch (2012). Ucto: Unicode Tokeniser. Reference Guide. ILK Technical Report 12-05.</cmdp:title>
               <cmdp:documentationTarget>technical</cmdp:documentationTarget>
               <cmdp:url>http://ilk.uvt.nl/downloads/pub/papers/ilk.1205.pdf</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <!-- The remaining entries use a short keyword as their title. -->
            <cmdp:Documentation>
               <cmdp:title>readme</cmdp:title>
               <cmdp:documentationTarget>user</cmdp:documentationTarget>
               <cmdp:url>https://github.com/LanguageMachines/ucto/blob/master/README.md</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <cmdp:Documentation>
               <cmdp:title>releaseNotes</cmdp:title>
               <cmdp:documentationTarget>user</cmdp:documentationTarget>
               <cmdp:url>https://github.com/LanguageMachines/ucto/releases</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <cmdp:Documentation>
               <cmdp:title>issueTracker</cmdp:title>
               <cmdp:documentationTarget>technical</cmdp:documentationTarget>
               <cmdp:url>https://github.com/LanguageMachines/ucto/issues</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <cmdp:Documentation>
               <cmdp:title>contIntegration</cmdp:title>
               <cmdp:documentationTarget>technical</cmdp:documentationTarget>
               <cmdp:url>https://travis-ci.org/LanguageMachines/ucto</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <cmdp:Pictures>
               <cmdp:picture type="logo">https://raw.githubusercontent.com/LanguageMachines/ucto/master/logo.svg</cmdp:picture>
            </cmdp:Pictures>
         </cmdp:ResourceDocumentation>
         <!-- Development context: the projects under which the tool was built
              and the people who created it. -->
         <cmdp:SoftwareDevelopment>
            <!-- Project details mirror the entries with the same names under
                 GeneralInfo/NationalProjects.
                 NOTE(review): the Duration children assume the same structure
                 as in NationalProjects; confirm against the profile. -->
            <cmdp:Project>
               <cmdp:name>CLARIN-NL</cmdp:name>
               <cmdp:title>CLARIN in the Netherlands</cmdp:title>
               <cmdp:funder>NWO</cmdp:funder>
               <cmdp:url>http://www.clarin.nl</cmdp:url>
               <cmdp:Contact>
                  <cmdp:Person>Jan Odijk</cmdp:Person>
                  <cmdp:Email>j.odijk@uu.nl</cmdp:Email>
                  <cmdp:Organisation xml:lang="eng">Utrecht University</cmdp:Organisation>
               </cmdp:Contact>
               <cmdp:Duration>
                  <cmdp:StartYear>2009</cmdp:StartYear>
                  <cmdp:CompletionYear>2015</cmdp:CompletionYear>
               </cmdp:Duration>
            </cmdp:Project>
            <cmdp:Project>
               <cmdp:name>CLARIAH-CORE</cmdp:name>
               <cmdp:title>Common Lab Research Infrastructure for the Arts and the Humanities</cmdp:title>
               <cmdp:funder>NWO</cmdp:funder>
               <cmdp:url>http://www.clariah.nl</cmdp:url>
               <cmdp:Contact>
                  <cmdp:Person>Jan Odijk</cmdp:Person>
                  <cmdp:Email>j.odijk@uu.nl</cmdp:Email>
                  <cmdp:Organisation xml:lang="eng">Utrecht University</cmdp:Organisation>
               </cmdp:Contact>
               <cmdp:Duration>
                  <cmdp:StartYear>2015</cmdp:StartYear>
                  <cmdp:CompletionYear>2018</cmdp:CompletionYear>
               </cmdp:Duration>
            </cmdp:Project>
            <!-- One Creator entry per person. Text values are written without
                 surrounding whitespace so that no indentation ends up inside
                 the data. -->
            <cmdp:Creator>
               <cmdp:Role>project lead</cmdp:Role>
               <cmdp:Contact>
                  <cmdp:Person>Antal van den Bosch</cmdp:Person>
                  <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
                  <cmdp:Email>a.vandenbosch@let.ru.nl</cmdp:Email>
                  <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
                  <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
                  <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
               </cmdp:Contact>
            </cmdp:Creator>
            <cmdp:Creator>
               <cmdp:Role>software developer</cmdp:Role>
               <cmdp:Contact>
                  <cmdp:Person>Maarten van Gompel</cmdp:Person>
                  <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
                  <cmdp:Email>proycon@anaproy.nl</cmdp:Email>
                  <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
                  <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
                  <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
               </cmdp:Contact>
            </cmdp:Creator>
            <cmdp:Creator>
               <cmdp:Role>software developer</cmdp:Role>
               <!-- No e-mail address is recorded for this contact. -->
               <cmdp:Contact>
                  <cmdp:Person>Ko van der Sloot</cmdp:Person>
                  <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
                  <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
                  <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
                  <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
               </cmdp:Contact>
            </cmdp:Creator>
         </cmdp:SoftwareDevelopment>
         <!-- Technical details: the implementation language of the tool. -->
         <cmdp:TechnicalInfo>
            <cmdp:ImplementationLanguage>
               <cmdp:implementationLanguage>C++</cmdp:implementationLanguage>
               <!-- The language version is recorded as the literal text "unknown". -->
               <cmdp:version>unknown</cmdp:version>
            </cmdp:ImplementationLanguage>
         </cmdp:TechnicalInfo>
	        <cmdp:LRS>
	           <cmdp:Authentication>Yes. Before tool use, please register at https://webservices-lst.science.ru.nl/register.</cmdp:Authentication>
		          <cmdp:Description>
               <cmdp:Description>Ucto</cmdp:Description>
            </cmdp:Description>
		          <cmdp:ToolTasks>
			            <cmdp:toolTask>sentence splitting</cmdp:toolTask>
			            <cmdp:toolTask>tokenisation</cmdp:toolTask>
		          </cmdp:ToolTasks>
		          <cmdp:Input>
			            <cmdp:inputType>text</cmdp:inputType>
			            <cmdp:Schema>
                  <cmdp:schemaname>PDF</cmdp:schemaname>
               </cmdp:Schema>
		             <cmdp:MimeType>
                  <cmdp:MimeType>application/pdf</cmdp:MimeType>
               </cmdp:MimeType>
		          </cmdp:Input>
		          <cmdp:Input>
			            <cmdp:inputType>text</cmdp:inputType>
			            <cmdp:Schema>
                  <cmdp:schemaname>MS-Word</cmdp:schemaname>
               </cmdp:Schema>
		             <cmdp:MimeType>
                  <cmdp:MimeType>application/msword</cmdp:MimeType>
               </cmdp:MimeType>
		          </cmdp:Input>
		          <cmdp:Input>
		             <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
			            <cmdp:inputType>text</cmdp:inputType>
		             <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
		          </cmdp:Input>
		          <cmdp:Input>
		             <cmdp:characterEncoding>ISO-8859-1</cmdp:characterEncoding>
			            <cmdp:inputType>text</cmdp:inputType>
		             <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
		          </cmdp:Input>
		          <cmdp:Input>
		             <cmdp:characterEncoding>ISO 8859-15</cmdp:characterEncoding>
			            <cmdp:inputType>text</cmdp:inputType>
		             <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
		          </cmdp:Input>
		          <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
				           <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
                <cmdp:Schema>
                  <cmdp:schemaname>One Sentence per Line</cmdp:schemaname>
               </cmdp:Schema> 
		             <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
                <cmdp:AnnotationType>
					             <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
					             <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
				           </cmdp:AnnotationType>
		          </cmdp:Output>
		          <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
				           <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
                <cmdp:Schema>
                  <cmdp:schemaname>One Token per Line</cmdp:schemaname>
               </cmdp:Schema> 
		             <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
                <cmdp:AnnotationType>
					             <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
					             <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
				           </cmdp:AnnotationType>
		          </cmdp:Output>
		          <!-- Output variant: schema "FoLiA", MIME type text/xml, encoding declared as utf8.
		               Lists sentence-boundary and token annotation types. -->
		          <cmdp:Output>
		             <cmdp:outputType>text</cmdp:outputType>
		             <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
		             <cmdp:Schema>
		                <cmdp:schemaname>FoLiA</cmdp:schemaname>
		             </cmdp:Schema>
		             <cmdp:MimeType>
		                <cmdp:MimeType>text/xml</cmdp:MimeType>
		             </cmdp:MimeType>
		             <cmdp:AnnotationType>
		                <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
		                <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
		             </cmdp:AnnotationType>
		          </cmdp:Output>
		
		          <!-- Name/value parameter list for this LRS entry: project, input and lang.
		               Occurrence note kept from the original markup: ActualParameters 0-1,
		               ActualParameter 1 - unbounded. -->
		          <cmdp:ActualParameters>
		             <cmdp:ActualParameter>
		                <cmdp:ActualParameterName>project</cmdp:ActualParameterName>
		                <cmdp:ActualParameterValue>new</cmdp:ActualParameterValue>
		             </cmdp:ActualParameter>
		             <cmdp:ActualParameter>
		                <cmdp:ActualParameterName>input</cmdp:ActualParameterName>
		                <cmdp:ActualParameterValue>self.linkToResource</cmdp:ActualParameterValue>
		             </cmdp:ActualParameter>
		             <cmdp:ActualParameter>
		                <cmdp:ActualParameterName>lang</cmdp:ActualParameterName>
		                <cmdp:ActualParameterValue>self.linkToResourceLanguage</cmdp:ActualParameterValue>
		             </cmdp:ActualParameter>
		          </cmdp:ActualParameters>
		          <!-- Mapping: LRS parameter "input" is paired with actual parameter "untokinput_url". -->
		          <cmdp:LRSMapping>
		             <cmdp:LRSParameterName>input</cmdp:LRSParameterName>
		             <cmdp:ActualParameterName>untokinput_url</cmdp:ActualParameterName>
		          </cmdp:LRSMapping>
		          <!-- Mapping: LRS parameter "lang" is paired with actual parameter "untokinput_language". -->
		          <cmdp:LRSMapping>
		             <cmdp:LRSParameterName>lang</cmdp:LRSParameterName>
		             <cmdp:ActualParameterName>untokinput_language</cmdp:ActualParameterName>
		          </cmdp:LRSMapping>
	        </cmdp:LRS>
	        <!-- LRS entry for the FoLiA input variant: one text/folia+xml input, three output
	             variants, a parameter list, and mappings onto the foliainput_* parameters. -->
	        <cmdp:LRS>
	           <cmdp:Authentication>Yes. Before tool use, please register at https://webservices-lst.science.ru.nl/register.</cmdp:Authentication>
	           <cmdp:Description>
	              <cmdp:Description>Ucto</cmdp:Description>
	           </cmdp:Description>
	           <cmdp:ToolTasks>
	              <cmdp:toolTask>sentence splitting</cmdp:toolTask>
	              <cmdp:toolTask>tokenisation</cmdp:toolTask>
	           </cmdp:ToolTasks>

	           <!-- Accepted input: schema "FoLiA", MIME type text/folia+xml, encoding declared as utf8. -->
	           <cmdp:Input>
	              <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
	              <cmdp:inputType>text</cmdp:inputType>
	              <cmdp:Schema>
	                 <cmdp:schemaname>FoLiA</cmdp:schemaname>
	              </cmdp:Schema>
	              <cmdp:MimeType>
	                 <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
	              </cmdp:MimeType>
	           </cmdp:Input>

	           <!-- Output variant 1: text/plain, schema "One Sentence per Line". -->
	           <cmdp:Output>
	              <cmdp:outputType>text</cmdp:outputType>
	              <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
	              <cmdp:Schema>
	                 <cmdp:schemaname>One Sentence per Line</cmdp:schemaname>
	              </cmdp:Schema>
	              <cmdp:MimeType>
	                 <cmdp:MimeType>text/plain</cmdp:MimeType>
	              </cmdp:MimeType>
	              <cmdp:AnnotationType>
	                 <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
	                 <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
	              </cmdp:AnnotationType>
	           </cmdp:Output>

	           <!-- Output variant 2: text/plain, schema "One Token per Line". -->
	           <cmdp:Output>
	              <cmdp:outputType>text</cmdp:outputType>
	              <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
	              <cmdp:Schema>
	                 <cmdp:schemaname>One Token per Line</cmdp:schemaname>
	              </cmdp:Schema>
	              <cmdp:MimeType>
	                 <cmdp:MimeType>text/plain</cmdp:MimeType>
	              </cmdp:MimeType>
	              <cmdp:AnnotationType>
	                 <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
	                 <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
	              </cmdp:AnnotationType>
	           </cmdp:Output>

	           <!-- Output variant 3: text/folia+xml, schema "FoLiA". -->
	           <cmdp:Output>
	              <cmdp:outputType>text</cmdp:outputType>
	              <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
	              <cmdp:Schema>
	                 <cmdp:schemaname>FoLiA</cmdp:schemaname>
	              </cmdp:Schema>
	              <cmdp:MimeType>
	                 <cmdp:MimeType>text/folia+xml</cmdp:MimeType>
	              </cmdp:MimeType>
	              <cmdp:AnnotationType>
	                 <cmdp:AnnotationType>Discourse/Sentence Boundaries</cmdp:AnnotationType>
	                 <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
	              </cmdp:AnnotationType>
	           </cmdp:Output>

	           <!-- Name/value parameter list: project, input and lang.
	                Occurrence note kept from the original markup: ActualParameters 0-1,
	                ActualParameter 1 - unbounded. -->
	           <cmdp:ActualParameters>
	              <cmdp:ActualParameter>
	                 <cmdp:ActualParameterName>project</cmdp:ActualParameterName>
	                 <cmdp:ActualParameterValue>new</cmdp:ActualParameterValue>
	              </cmdp:ActualParameter>
	              <cmdp:ActualParameter>
	                 <cmdp:ActualParameterName>input</cmdp:ActualParameterName>
	                 <cmdp:ActualParameterValue>self.linkToResource</cmdp:ActualParameterValue>
	              </cmdp:ActualParameter>
	              <cmdp:ActualParameter>
	                 <cmdp:ActualParameterName>lang</cmdp:ActualParameterName>
	                 <cmdp:ActualParameterValue>self.linkToResourceLanguage</cmdp:ActualParameterValue>
	              </cmdp:ActualParameter>
	           </cmdp:ActualParameters>

	           <!-- Mappings: LRS parameters "input" and "lang" are paired with the
	                foliainput_url and foliainput_language parameter names. -->
	           <cmdp:LRSMapping>
	              <cmdp:LRSParameterName>input</cmdp:LRSParameterName>
	              <cmdp:ActualParameterName>foliainput_url</cmdp:ActualParameterName>
	           </cmdp:LRSMapping>
	           <cmdp:LRSMapping>
	              <cmdp:LRSParameterName>lang</cmdp:LRSParameterName>
	              <cmdp:ActualParameterName>foliainput_language</cmdp:ActualParameterName>
	           </cmdp:LRSMapping>
	        </cmdp:LRS>
      </cmdp:ClarinSoftwareDescription>
   </cmd:Components>
</cmd:CMD>
Organisation:
  • Utrecht University
  • Radboud University Nijmegen

Resources:

Resource

application/pdf