Automatic Transcription of Oral History Interviews

<?xml version="1.0" encoding="UTF-8"?>
<cmd:CMD xmlns:cmd="http://www.clarin.eu/cmd/1"
         xmlns:cmdp="http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1342181139640"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         CMDVersion="1.2"
         xsi:schemaLocation="http://www.clarin.eu/cmd/1 https://infra.clarin.eu/CMDI/1.x/xsd/cmd-envelop.xsd http://www.clarin.eu/cmd/1/profiles/clarin.eu:cr1:p_1342181139640 https://catalog.clarin.eu/ds/ComponentRegistry/rest/registry/1.1/profiles/clarin.eu:cr1:p_1342181139640/1.2/xsd">
   <!-- Envelope header: creator, creation date, profile identifier and collection name of this metadata record. -->
   <cmd:Header>
      <cmd:MdCreator>janodijk</cmd:MdCreator>
      <cmd:MdCreationDate>2018-08-01+02:00</cmd:MdCreationDate>
      <!-- Same profile identifier as the cmdp namespace bound on the root element. -->
      <cmd:MdProfile>clarin.eu:cr1:p_1342181139640</cmd:MdProfile>
      <cmd:MdCollectionDisplayName>CLARIN Netherlands</cmd:MdCollectionDisplayName>
   </cmd:Header>
   <!-- Resource section: a single proxy pointing at the service URL; the journal and relation lists are empty. -->
   <cmd:Resources>
      <cmd:ResourceProxyList>
         <cmd:ResourceProxy id="OH001">
            <cmd:ResourceType>Resource</cmd:ResourceType>
            <cmd:ResourceRef>https://webservices-lst.science.ru.nl/oral_history</cmd:ResourceRef>
         </cmd:ResourceProxy>
      </cmd:ResourceProxyList>
      <cmd:JournalFileProxyList/>
      <cmd:ResourceRelationList/>
   </cmd:Resources>
   <cmd:Components>
      <cmdp:ClarinSoftwareDescription>
         <!-- General identification of the tool: names, version, release status, funding project, country and description. -->
         <cmdp:GeneralInfo>
            <cmdp:name xml:lang="eng">Automatic Transcription of Oral History Interviews</cmdp:name>
            <cmdp:title xml:lang="eng">Automatic Transcription of Oral History Interviews</cmdp:title>
            <cmdp:version>v0.1</cmdp:version>
            <!-- NOTE(review): this field holds a full date rather than a bare year; confirm the profile accepts it. -->
            <cmdp:publicationYear>2017-04-02</cmdp:publicationYear>
            <cmdp:url>https://webservices-lst.science.ru.nl/oral_history</cmdp:url>
            <cmdp:CLARINCentre>none yet</cmdp:CLARINCentre>
            <cmdp:ReleaseStatus>
               <cmdp:LifeCycleStatus>published</cmdp:LifeCycleStatus>
               <cmdp:lastUpdate>2018-05-17</cmdp:lastUpdate>
               <cmdp:version>v0.1</cmdp:version>
            </cmdp:ReleaseStatus>
            <cmdp:NationalProjects>
               <cmdp:Project>
                  <cmdp:name>CLARIAH-CORE</cmdp:name>
                  <cmdp:title>Common Lab Research Infrastructure for the Arts and the Humanities</cmdp:title>
                  <cmdp:id>184.033.101</cmdp:id>
                  <cmdp:funder>NWO</cmdp:funder>
                  <cmdp:url>http://www.clariah.nl</cmdp:url>
                  <cmdp:Contact>
                     <cmdp:Person>Jan Odijk</cmdp:Person>
                     <cmdp:Role>National Coordinator</cmdp:Role>
                     <cmdp:Address>Utrecht, the Netherlands</cmdp:Address>
                     <cmdp:Email>j.odijk@uu.nl</cmdp:Email>
                     <cmdp:Department>UiL-OTS</cmdp:Department>
                     <cmdp:Organisation>Utrecht University</cmdp:Organisation>
                  </cmdp:Contact>
                  <cmdp:Duration>
                     <cmdp:StartYear>2015</cmdp:StartYear>
                     <cmdp:CompletionYear>2018</cmdp:CompletionYear>
                  </cmdp:Duration>
               </cmdp:Project>
            </cmdp:NationalProjects>
            <cmdp:Country>
               <cmdp:CountryName>Netherlands</cmdp:CountryName>
               <cmdp:CountryCoding>NL</cmdp:CountryCoding>
            </cmdp:Country>
            <cmdp:Description>
               <!-- Kept on one line: whitespace inside a text element is delivered to consumers as part of the value. -->
               <cmdp:Description xml:lang="eng">This webservice and web application uses automatic speech recognition to provide the transcriptions of recordings spoken in Dutch. You can upload and process only one file per project. For bulk processing and other questions, please contact Henk van den Heuvel at h.vandenheuvel@let.ru.nl.</cmdp:Description>
            </cmdp:Description>
         </cmdp:GeneralInfo>
         <!-- Functional classification: tool categories, tasks, research phase and domains, and the language variety handled. -->
         <cmdp:SoftwareFunction>
            <cmdp:toolCategory>spoken language tool</cmdp:toolCategory>
            <cmdp:toolCategory>mono-lingual tool</cmdp:toolCategory>
            <cmdp:ToolTasks>
               <cmdp:toolTask>speech recognition</cmdp:toolTask>
               <cmdp:toolTask>diarization</cmdp:toolTask>
            </cmdp:ToolTasks>
            <cmdp:ResearchPhases>
               <cmdp:ResearchPhase>Enriching Data</cmdp:ResearchPhase>
            </cmdp:ResearchPhases>
            <cmdp:ResearchDomains>
               <cmdp:researchDomain>Oral History</cmdp:researchDomain>
               <cmdp:researchDomain>Political Studies</cmdp:researchDomain>
            </cmdp:ResearchDomains>
            <cmdp:LanguageVariety>
               <cmdp:languageDependent>yes</cmdp:languageDependent>
               <cmdp:Language>
                  <cmdp:LanguageName>Dutch</cmdp:LanguageName>
                  <cmdp:ISO639>
                     <cmdp:iso-639-3-code>nld</cmdp:iso-639-3-code>
                  </cmdp:ISO639>
               </cmdp:Language>
               <cmdp:Centuries>
                  <cmdp:centuryDependent>yes</cmdp:centuryDependent>
                  <cmdp:CenturyInterval>
                     <cmdp:centuryFrom>20</cmdp:centuryFrom>
                     <cmdp:centuryThrough>21</cmdp:centuryThrough>
                  </cmdp:CenturyInterval>
               </cmdp:Centuries>
            </cmdp:LanguageVariety>
         </cmdp:SoftwareFunction>
         <!-- Implementation details: distribution, installation requirements, user interfaces, accepted inputs and produced outputs. -->
         <cmdp:SoftwareImplementation>
            <cmdp:distributionMedium>Online available</cmdp:distributionMedium>
            <cmdp:sourcecodeURI>https://github.com/schemreier/oralhistory</cmdp:sourcecodeURI>
            <cmdp:InstallationRequirements>
               <cmdp:MinimumHardwareRequirements>
                  <cmdp:SystemRequirements>
                     <cmdp:workingMemoryMin>not specified</cmdp:workingMemoryMin>
                     <cmdp:hardDiskMin>not specified</cmdp:hardDiskMin>
                     <cmdp:Platform>
                        <cmdp:operatingSystem>POSIX</cmdp:operatingSystem>
                        <cmdp:operatingSystemVersion>not specified</cmdp:operatingSystemVersion>
                        <cmdp:bitArchitecture>unknown</cmdp:bitArchitecture>
                     </cmdp:Platform>
                  </cmdp:SystemRequirements>
               </cmdp:MinimumHardwareRequirements>
               <cmdp:SoftwareRequirements>
                  <cmdp:RequiredSoftware>
                     <cmdp:SoftwareShortDescription>
                        <cmdp:resourceName>KALDI</cmdp:resourceName>
                        <cmdp:version>not specified</cmdp:version>
                        <cmdp:url>https://github.com/kaldi-asr/kaldi</cmdp:url>
                        <cmdp:applicationType>localDesktop</cmdp:applicationType>
                     </cmdp:SoftwareShortDescription>
                  </cmdp:RequiredSoftware>
               </cmdp:SoftwareRequirements>
            </cmdp:InstallationRequirements>
            <!-- The tool is offered both as a web application and as a web service. -->
            <cmdp:UserInterface>
               <cmdp:interfaceType>graphical user interface</cmdp:interfaceType>
               <cmdp:applicationType>web application</cmdp:applicationType>
            </cmdp:UserInterface>
            <cmdp:UserInterface>
               <cmdp:interfaceType>web interface</cmdp:interfaceType>
               <cmdp:applicationType>web service</cmdp:applicationType>
            </cmdp:UserInterface>
            <!-- Inputs: one entry per kind of recording; all three list the same audio media types. -->
            <cmdp:Input>
               <cmdp:inputType>audio</cmdp:inputType>
               <cmdp:inputResource>Daily Conversations</cmdp:inputResource>
               <cmdp:MimeType>
                  <cmdp:MimeType>audio/wav</cmdp:MimeType>
                  <cmdp:MimeType>audio/ogg</cmdp:MimeType>
                  <cmdp:MimeType>audio/mpeg</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <cmdp:Input>
               <cmdp:inputType>audio</cmdp:inputType>
               <cmdp:inputResource>Oral History Interviews</cmdp:inputResource>
               <cmdp:MimeType>
                  <cmdp:MimeType>audio/wav</cmdp:MimeType>
                  <cmdp:MimeType>audio/ogg</cmdp:MimeType>
                  <cmdp:MimeType>audio/mpeg</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <cmdp:Input>
               <cmdp:inputType>audio</cmdp:inputType>
               <cmdp:inputResource>Parliament Talks</cmdp:inputResource>
               <cmdp:MimeType>
                  <cmdp:MimeType>audio/wav</cmdp:MimeType>
                  <cmdp:MimeType>audio/ogg</cmdp:MimeType>
                  <cmdp:MimeType>audio/mpeg</cmdp:MimeType>
               </cmdp:MimeType>
            </cmdp:Input>
            <!-- Outputs: one entry per result file, each with its schema name and media type. -->
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:outputResource>.txt file: Transcription</cmdp:outputResource>
               <cmdp:Schema>
                  <cmdp:schemaname>TXT</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:outputResource>Transcription in XML format (same contents as the .ctm file)</cmdp:outputResource>
               <cmdp:Schema>
                  <cmdp:schemaname>Audio Doc</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/xml</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:outputResource>.sent file: attempt to turn the transcription into sentences</cmdp:outputResource>
               <cmdp:Schema>
                  <cmdp:schemaname>SENT</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/plain</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:outputResource>.ctm file: Transcription word by word, with time stamps for start and end, plus  confidence</cmdp:outputResource>
               <cmdp:Schema>
                  <cmdp:schemaname>CTM</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/csv</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:outputResource>.ctm.spk file: Transcription word by word, with start, end, confidence and speaker linking information</cmdp:outputResource>
               <cmdp:Schema>
                  <cmdp:schemaname>CTM.SPK</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/csv</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
            <cmdp:Output>
               <cmdp:outputType>text</cmdp:outputType>
               <cmdp:characterEncoding>utf8</cmdp:characterEncoding>
               <cmdp:outputResource>.rttm file: speaker diarization information: speaker linking info with timestamps</cmdp:outputResource>
               <cmdp:Schema>
                  <cmdp:schemaname>RTTM</cmdp:schemaname>
               </cmdp:Schema>
               <cmdp:MimeType>
                  <cmdp:MimeType>text/csv</cmdp:MimeType>
               </cmdp:MimeType>
               <cmdp:AnnotationType>
                  <cmdp:AnnotationType>Orthography/Token</cmdp:AnnotationType>
               </cmdp:AnnotationType>
            </cmdp:Output>
         </cmdp:SoftwareImplementation>
         <!-- Access conditions: licence and price information, plus the contact person for the tool. -->
         <cmdp:Access>
            <cmdp:ResourceLicense>
               <cmdp:license>unknown</cmdp:license>
               <cmdp:version>unknown</cmdp:version>
               <cmdp:distributionType>public</cmdp:distributionType>
               <cmdp:url/>
               <cmdp:Price>
                  <cmdp:amount>0</cmdp:amount>
                  <cmdp:ISO4217>
                     <cmdp:iso-4217-currency>EUR</cmdp:iso-4217-currency>
                  </cmdp:ISO4217>
               </cmdp:Price>
            </cmdp:ResourceLicense>
            <!-- Leaf values are written without surrounding line breaks so the whitespace does not become part of the data. -->
            <cmdp:Contact>
               <cmdp:Person>Henk van den Heuvel</cmdp:Person>
               <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
               <cmdp:Email>h.vandenheuvel@let.ru.nl</cmdp:Email>
               <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
               <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
               <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
            </cmdp:Contact>
         </cmdp:Access>
         <!-- Documentation pointers: each entry gives a title, target audience, URL and the language of the document. -->
         <cmdp:ResourceDocumentation>
            <!-- NOTE(review): this entry cites the Frog manual, while every other URL in this record points to the oral_history service or repository; confirm it belongs here and check the title wording. -->
            <cmdp:Documentation>
               <cmdp:title>Iris Hendrickx, Antal van den Bosch, Maarten van Gompel, Ko van der Sloot, Walter Daelemans. 2016.Frog: An  Natural Language Processing suite for Dutch. CLST Technical Report 16-02.</cmdp:title>
               <cmdp:documentationTarget>user</cmdp:documentationTarget>
               <cmdp:url>https://github.com/LanguageMachines/frog/blob/master/docs/frogmanual.pdf</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <cmdp:Documentation>
               <cmdp:title>Webservice Specification</cmdp:title>
               <cmdp:documentationTarget>user</cmdp:documentationTarget>
               <cmdp:url>https://webservices-lst.science.ru.nl/oral_history/info/</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <cmdp:Documentation>
               <cmdp:title>readme</cmdp:title>
               <cmdp:documentationTarget>user</cmdp:documentationTarget>
               <cmdp:url>https://github.com/schemreier/oralhistory/blob/master/README.md</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
            <!-- The issue tracker is the only entry aimed at a technical audience. -->
            <cmdp:Documentation>
               <cmdp:title>issueTracker</cmdp:title>
               <cmdp:documentationTarget>technical</cmdp:documentationTarget>
               <cmdp:url>https://github.com/schemreier/oralhistory/issues</cmdp:url>
               <cmdp:ISO639>
                  <cmdp:iso-639-3-code>eng</cmdp:iso-639-3-code>
               </cmdp:ISO639>
            </cmdp:Documentation>
		
         </cmdp:ResourceDocumentation>
         <!-- Development information: the originating project (details not recorded) and the people who created the tool. -->
         <cmdp:SoftwareDevelopment>
            <cmdp:Project>
               <cmdp:name>unknown</cmdp:name>
               <cmdp:title>unknown</cmdp:title>
               <cmdp:funder>unknown</cmdp:funder>
               <cmdp:url/>
               <cmdp:Contact/>
               <cmdp:Duration/>
            </cmdp:Project>
            <!-- Leaf values are written without surrounding line breaks so the whitespace does not become part of the data. -->
            <cmdp:Creator>
               <cmdp:Role>project lead</cmdp:Role>
               <cmdp:Contact>
                  <cmdp:Person>Henk van den Heuvel</cmdp:Person>
                  <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
                  <cmdp:Email>h.vandenheuvel@let.ru.nl</cmdp:Email>
                  <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
                  <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
                  <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
               </cmdp:Contact>
            </cmdp:Creator>
            <cmdp:Creator>
               <cmdp:Role>software developer</cmdp:Role>
               <cmdp:Contact>
                  <cmdp:Person>Emre Yılmaz</cmdp:Person>
                  <cmdp:Address>Nijmegen, the Netherlands</cmdp:Address>
                  <cmdp:Email>emre.yilmaz@let.ru.nl</cmdp:Email>
                  <cmdp:Department>Center for Language and Speech Technology</cmdp:Department>
                  <cmdp:Organisation>Radboud University Nijmegen</cmdp:Organisation>
                  <cmdp:Url>https://www.ru.nl/clst/</cmdp:Url>
               </cmdp:Contact>
            </cmdp:Creator>
         </cmdp:SoftwareDevelopment>
         <!-- Technical details: implementation language and its version; both are recorded as "unknown". -->
         <cmdp:TechnicalInfo>
            <cmdp:ImplementationLanguage>
               <cmdp:implementationLanguage>unknown</cmdp:implementationLanguage>
               <cmdp:version>unknown</cmdp:version>
            </cmdp:ImplementationLanguage>
         </cmdp:TechnicalInfo>
      </cmdp:ClarinSoftwareDescription>
   </cmd:Components>
</cmd:CMD>
Organisation:
  • Utrecht University
  • Radboud University Nijmegen

Resources:

Resource

audio/wav