#!/usr/bin/perl

# This perl script converts czeng-export-format files into CoNLL format.
# See http://ufal.mff.cuni.cz/czeng/czeng09/ for the description of CzEng
# (Czech-English parallel corpus).
# Author: David Marecek (marecek@ufal.mff.cuni.cz)
# Date: February 4, 2011
# version 1.0
#
# Usage:
#   ./czeng_to_conll.pl czeng_export_format_input_file cs_conll_output_file en_conll_output_file
#
# Every input line is split on tabs. Column 1 (0-based) is read as the
# English tree and column 5 as the Czech tree. Each tree is a
# whitespace-separated list of nodes, and each node is a '|'-separated record
#   form|lemma|tag|ord|parent|afun
# For every input line one sentence (one row per node, followed by an empty
# line) is appended to each of the two output files.

use strict;
use warnings;

if (@ARGV < 3) {
    print STDERR "\nUsage: ./czeng_to_conll.pl czeng_export_format_input_file cs_conll_output_file en_conll_output_file\n\n";
    exit;
}

my ($czeng_export_file, $cs_conll_file, $en_conll_file) = @ARGV;

open(my $in, "<:utf8", $czeng_export_file)
    or die "Cannot open '$czeng_export_file' for reading: $!\n";
open(my $cs_out, ">:utf8", $cs_conll_file)
    or die "Cannot open '$cs_conll_file' for writing: $!\n";
open(my $en_out, ">:utf8", $en_conll_file)
    or die "Cannot open '$en_conll_file' for writing: $!\n";

# Read the input line by line; the readline operator is what terminates the
# loop at end of file.
while (my $line = <$in>) {
    chomp $line;
    my @trees = split(/\t/, $line);

    # The coarse POS column is a prefix of the full tag: the first two
    # characters for English, the first character for Czech.
    _print_conll_sentence($en_out, $trees[1], 2);
    _print_conll_sentence($cs_out, $trees[5], 1);
}

# Buffered write errors only surface at close, so check the output handles.
close $en_out or die "Error while closing '$en_conll_file': $!\n";
close $cs_out or die "Error while closing '$cs_conll_file': $!\n";
close $in;

# Writes one tree as a block of 10-column, tab-separated rows followed by an
# empty separator line.
#
#   $out     - output filehandle
#   $tree    - whitespace-separated nodes, each "form|lemma|tag|ord|parent|afun";
#              an undefined tree (input line with too few columns) is treated
#              as empty, so only the separator line is written
#   $pos_len - number of leading tag characters used for the coarse POS column
sub _print_conll_sentence {
    my ($out, $tree, $pos_len) = @_;
    $tree = '' unless defined $tree;

    foreach my $node (split(/\s/, $tree)) {
        my ($form, $lemma, $tag, $ord, $parent, $afun) = split(/\|/, $node);

        # Tags no longer than $pos_len are kept whole.
        my $pos = substr($tag, 0, $pos_len);

        print {$out} "$ord\t$form\t$lemma\t$pos\t$tag\t_\t$parent\t$afun\t_\t_\n";
    }
    print {$out} "\n";
    return;
}