#!/usr/bin/env perl
#
# TODO:
#  - avoid reloading dictionary ... solved
#  - avoid system calls ........... solved
#  - avoid file system access ..... mostly solved
#

use strict;
use FindBin qw($Bin);
use lib "$Bin/lib";

use File::Basename;
use Getopt::Std;
use Time::Local;
use Text::SRT::Align qw/:all/;

$Text::SRT::Align::StoreXML = 1;

=head1 NAME

subalign - script for aligning the OpenSubtitlesXXXX corpora

=head1 SYNOPSIS

 subalign [OPTIONS] <srcdir> <trgdir>

C<srcdir> and C<trgdir> are directories in the subtitle corpus 
from the source and the target language. The script creates a
corresponding sub-dir for the aligned data. For example

 subalign en/2001/209475 et/2001/209475

aligns files in the English and Estonian collection of subtitles
for a movie from 2001 with the ID 209475. The resulting files will
be created in C<en-et/2001/209475>.

=head1 OPTIONS

Command line arguments for subalign:

 -A ................... store alternative alignments in outdir/alt
 -a accept-threshold .. accept alternative subtitle pairs > score 
                        (default=0.75)
 -D duration-thr ...... min duration similarity (default=0.8)
 -M max ............... max nr of subtitle file pairs to try
 -L ................... skip symbolic links (when looking for files)
 -x score-threshold ... threshold for overlap + metascore 
                        (before aligning, default = 0.2)

Command line arguments related to srt-alignment

 -S source-lang . source language ID
 -T target-lang . target language ID
 -c score ....... use cognates with LCSR>=score
 -r score-range . use cognates in a certain range 1..score and take best
 -l length ...... set minimal length of cognates (if used)
 -i len ......... use identical strings with length>=len
 -w size ........ set size for sliding window
 -d dic ......... use dictionary in file 'dic'
 -u ............. cognates/identicals that start with upper case only
 -r char_set .... define a set of characters to be used for matching
 -q ............. normalize length scores with (current) word frequencies
 -b ............. use "best" alignment (least empty alignments)
 -p nr .......... stop after <nr> candidates (when using -b)
 -m MAX ......... in "best" alignment: use only MAX first & MAX last
                  (default = 10; 0 = all)
 -f uplug-conf .. use fallback aligner if necessary
 -v ............. verbose output

=head1 DESCRIPTION

subalign looks at pairs of movie subtitle files in the OpenSubtitle corpora
and tries to find the best pair that aligns with the least empty translation units
among all alternative subtitle files. It uses a score that combines the proportion
of non-empty alignment units and a score based on metadata. The latter requires 
meta-information stored in the subtitle files (which is available from OpenSubtitles2016)

=cut


use vars qw/$opt_A $opt_a $opt_b $opt_c $opt_D $opt_d
            $opt_f $opt_h $opt_I $opt_i $opt_L $opt_l
            $opt_M $opt_m $opt_O $opt_p
            $opt_q $opt_r $opt_S $opt_s $opt_T $opt_t
            $opt_u $opt_v $opt_w $opt_x/;

getopts('Aa:bc:D:d:f:h:Ii:Ll:M:m:O:p:qr:S:s:T:t:uvw:x:');


# TODO ---> SOLVED
# common parameters that need to be resolved!
#-------------------------------------------------
#     srtalign        subalign                   changed in subalign
# d - dic             min duration time          --> -D
# l - minlen          no function?               --> delete
# t - toklen          min score                  --> -x (mixed/combined score)
# r - cognate range   align paras (can skip now) --> delete
# m - max matches     max subtitle pairs to try  --> -M



# should sum to one
my %ScoreWeight = ( utf8         => 0.01,   # 2x
		    recency      => 0.01,   # 2x
		    ignored      => 0.02,   # 2x
		    corrected    => 0.02,   # 2x
		    rating       => 0.05,   # 2x
		    langid       => 0.1,    # 2x
		    durationdiff => 0.1,
		    overlap      => 0.5 );
		    

# source and target language directories with one more sutitle files for the 
# same movie

#my $srcdir=shift || 'en/2009/47645';
#my $trgdir=shift || 'de/2009/47645';

my $srcdir=shift || 'en/2001/209475';
my $trgdir=shift || 'et/2001/209475';
my $outdir=shift;

# some more parameters:
# - alignment program
# - MinScore = minimum link-type ratio (ratio between empty and non-empty)
# - TrySync = score threshold for trying subtitle synchronization

my $SrtAlign = `which srtalign`;
chomp($SrtAlign);
$SrtAlign .= ' -P '. $opt_p;

# my $SrtAlign = 'srtalign -P '. $opt_p;

# my $SrtAlign = $Bin.
#    '/../../../tools/public/uplug-old/tools/subtitles/srtalign.pl '. $opt_p;

my $MinScore      = $opt_x || 0.2;   # min overlap score
my $MinAlignScore = $opt_a || 0.75;  # min align score (empty-align proportion)
my $MinOverlap    = $opt_O || 0.8;   # min time overlap

my $MinDurationSim = $opt_D || 0.8; # max relative similarity in duration

# make outdir if not given

if (! $outdir){
    my @SrcPath = split(/\//,$srcdir);
    my @TrgPath = split(/\//,$trgdir);

    $TrgPath[-3] = join('-',sort ($SrcPath[-3],$TrgPath[-3]));
    $outdir = join('/',@TrgPath);
}


# grep all sub-files in source and target language dir

opendir(my $dh, $srcdir) || die "can't opendir $srcdir: $!";
my @SrcSubs = grep { /\.xml.gz$/ && -f "$srcdir/$_" } readdir($dh);
closedir $dh;

opendir(my $dh, $trgdir) || die "can't opendir $trgdir: $!";
my @TrgSubs = grep { /\.xml.gz$/ && -f "$trgdir/$_" } readdir($dh);
closedir $dh;



## new: all combinations are possible!!

my %TimeFrames=();
my %MetaData=();
my %Overlap=();

foreach my $src (@SrcSubs){

    ## check if the source file is just a symbolic link
    my $SymbolicLink = 0;
    $SymbolicLink = 1 if (-l $srcdir.'/'.$src);

    foreach my $trg (@TrgSubs){

	## skip sub-pair if both files are symbolic links
	## and option -L is set
	if ($opt_L && $SymbolicLink){
	    next if (-l $trgdir.'/'.$trg);
	}

	if (!exists $TimeFrames{$src}){
	    $TimeFrames{$src} = [];
	    $MetaData{$src} = {};
	    read_time_frames($srcdir.'/'.$src,
			     $TimeFrames{$src},
			     $MetaData{$src});
	}
	if (!exists $TimeFrames{$trg}){
	    $TimeFrames{$trg} = [];
	    $MetaData{$trg} = {};
	    read_time_frames($trgdir.'/'.$trg,
			     $TimeFrames{$trg},
			     $MetaData{$trg});
	}
	$Overlap{"$src:$trg"} = time_overlap_ratio($TimeFrames{$src},
						   $TimeFrames{$trg});
    }
}

my %CandidateScores=();
my %CombinedScore=();
make_candidate_scores(\%Overlap,\%CandidateScores,\%CombinedScore,
		      \%MetaData);

print STDERR "\n";
#next unless (keys %Overlap);


if (keys %Overlap){
    best_align(\%CombinedScore,
	       $MinScore,
	       \%Overlap,
	       \%CandidateScores,
	       $opt_M);                    # stop after x pairs
}



sub make_candidate_scores{
    my $overlap=shift;
    my $scores=shift;
    my $combined=shift;
    my $meta=shift;

    my $first_date=0;
    my $last_date=0;
    foreach (keys %{$meta}){
	if ($$meta{$_}{upload_date}>$last_date){
	    $last_date = $$meta{$_}{upload_date};
	}
	$first_date = $last_date unless ($first_date);
	if ($$meta{$_}{upload_date}<$first_date){
	    $first_date = $$meta{$_}{upload_date};
	}
    }
    my $period = $last_date - $first_date;
    foreach (keys %{$meta}){
	if ($period && $first_date && $last_date){
	    $$meta{$_}{recency} = ($$meta{$_}{upload_date}-$first_date)/$period;
	}
	else{
	    $$meta{$_}{recency} = 1;
	}
	if ($$meta{$_}{encoding} eq 'utf-8'){
	    $$meta{$_}{utf8}=1;
	}
    }

# add factors to the scoring function

    foreach my $pair (keys %{$overlap}){
	my ($src,$trg) = split(/:/,$pair);
	$$scores{$pair} = $ScoreWeight{utf8} * $$meta{$src}{utf8};
	$$scores{$pair} += $ScoreWeight{utf8} * $$meta{$trg}{utf8};
	$$scores{$pair} += $ScoreWeight{recency} * $$meta{$src}{recency};
	$$scores{$pair} += $ScoreWeight{recency} * $$meta{$trg}{recency};

	if ($$meta{$src}{blocks}>0){
	    my $ok = 1-$$meta{$src}{ignored_blocks}/$$meta{$src}{blocks};
	    $$scores{$pair} += $ScoreWeight{ignored} * $ok ** 2;
	}
	if ($$meta{$trg}{blocks}>0){
	    my $ok = 1-$$meta{$trg}{ignored_blocks}/$$meta{$trg}{blocks};
	    $$scores{$pair} += $ScoreWeight{ignored} * $ok ** 2;
	}

	if (($$meta{$src}{duration_seconds}>0) || 
	    ($$meta{$trg}{duration_seconds}>0)){
	    if ($$meta{$src}{duration_seconds}>$$meta{$trg}{duration_seconds}){
		$$scores{"$pair:durationdiff"} =
		    1- ($$meta{$src}{duration_seconds}-
			$$meta{$trg}{duration_seconds}) / 
			$$meta{$src}{duration_seconds};
	    }
	    else{
		$$scores{"$pair:durationdiff"} =
		    1- ($$meta{$trg}{duration_seconds}-
			$$meta{$src}{duration_seconds}) / 
			$$meta{$trg}{duration_seconds};
	    }
	    $$scores{$pair} += $ScoreWeight{durationdiff} * 
		$$scores{"$pair:durationdiff"} ** 2;

	}
	if ($$meta{$src}{tokens}){
	    my $correct = 1-$$meta{$src}{corrected_words}/$$meta{$src}{tokens};
	    $$scores{$pair} += $ScoreWeight{corrected} * $correct ** 2;
	}
	if ($$meta{$trg}{tokens}){
	    my $correct = 1-$$meta{$trg}{corrected_words}/$$meta{$trg}{tokens};
	    $$scores{$pair} += $ScoreWeight{corrected} * $correct ** 2;
	}

	$$scores{$pair} += $ScoreWeight{langid} * $$meta{$src}{confidence};
	$$scores{$pair} += $ScoreWeight{langid} * $$meta{$trg}{confidence};

	## rating should only influence the score if it is set (>0)
	if ($$meta{$src}{rating}>0){
	    $$scores{$pair} += $ScoreWeight{rating} * $$meta{$src}{rating};
	}
	else{
	    $$scores{$pair} /= 1-$ScoreWeight{rating};
	}
	if ($$meta{$trg}{rating}>0){
	    $$scores{$pair} += $ScoreWeight{rating} * $$meta{$trg}{rating};
	}
	else{
	    $$scores{$pair} /= 1-$ScoreWeight{rating};
	}

	$$combined{$pair} = 
	    $ScoreWeight{overlap} * $$overlap{$pair} + $$scores{$pair};
    }
}


# %scores      = hash of time overlap ratios
# $threshold   = minimum time overlap ratio
# type         = subtitle type ('1of1' etc)
# realign_para = extra parameters to re-align subtitles (synchronization ...)
# align_para   = general alignment parameters (not combined with realign_para!)
# max          = maximum number of subtitle pairs to try

sub best_align{
    my $scores = shift;
    my $threshold = shift || 0.7;
    my $overlap = shift;
    my $metascore = shift;

    ## optional parameter
    my ($max) = @_;

    my @sorted = sort {$$scores{$b} <=> $$scores{$a}} keys %{$scores};

    if ($$scores{$sorted[0]}<$threshold){
	print STDERR "max score $$scores{$sorted[0]} below threshold!\n";
	return 0;
    }

    my $count        = 0;
    my %alignments   = ();
    my %alignedfiles = ();

    my %alignScore      = ();
    my %alignOverlap    = ();
    my %alignTotalScore = ();

    foreach my $p (@sorted){
	$count++;
	last if ($max and ($count > $max));
	my ($src,$trg) = split(/:/,$p);

	# min score threshold
	last if $$scores{$p}<$threshold;

	# skip subtitles that differ significantly in length
	next if ($$metascore{"$p:durationdiff"} < $MinDurationSim);

	my ($SrcPart)=split(/\./,$src);
	my ($TrgPart)=split(/\./,$trg);
	my $algfile = $SrcPart.'-'.$TrgPart;

	# align the best candidate 
	# but accept only of the link score is good enough
	# otherwise: continue with other candidates

	## hash of all alignments
	@{$alignments{$algfile}} = ();
	$alignedfiles{$algfile}[0] = "$srcdir/$src";
	$alignedfiles{$algfile}[1] = "$trgdir/$trg";

	## TODO: it does not make sense to re-read the same files again and again
	## --> change parse_bitext in Align.pm to remember the parsed structure
	## --> check Clone from Cpan: http://search.cpan.org/~garu/Clone-0.39/Clone.pm
	##     (faster than Storable: https://stackoverflow.com/questions/1546322/whats-the-best-way-to-deep-copy-a-hash-of-hashes-in-perl)
	## --> should be optiona to keep data in memory --> could get big
	##
	## ----> SOLVED: set $Text::SRT::Align::StoreXML to store bitexts and retrieve parsed data

	my ($score,$overlap) = 
	    subalign("$srcdir/$src","$trgdir/$trg",$alignments{$algfile});

	$alignScore{$algfile} = $score;
	$alignOverlap{$algfile} = $overlap;
	$alignTotalScore{$algfile} = $ScoreWeight{overlap} * $score + $$metascore{$p};

	print STDERR "align:\t$algfile \t$overlap\t$score\t$alignTotalScore{$algfile}\n";
    }

    my ($best) = sort { $alignTotalScore{$b} <=> $alignTotalScore{$a} } keys %alignTotalScore;
    if ($alignTotalScore{$best} > $MinAlignScore){
	system("mkdir -p $outdir") unless (-d $outdir);
	if (open ( my $fh,"| gzip -c > $outdir/$best.xml.gz" ) ){
	    print "best align:\t$outdir/$best.xml.gz\t$alignOverlap{$best}\t$alignScore{$best}\t\t$alignTotalScore{$best}\n";
	    &print_ces($alignedfiles{$best}[0],
		       $alignedfiles{$best}[1],
		       $alignments{$best},
		       { 
			   overlap => $alignOverlap{$best},
			   algscore => $alignScore{$best}, 
			   score => $alignTotalScore{$best} 
		       },
		       $fh);
	    delete $alignTotalScore{$best};
	}
    }
    if ($opt_A){
	if (keys %alignTotalScore){
	    system("mkdir -p $outdir/alt") unless (-d "$outdir/alt");
	    foreach my $a (keys %alignTotalScore){

		## TODO: should we have this or do we want all alternative alignments?
		if ($alignTotalScore{$a} > $MinAlignScore){
		    if (open ( my $fh,"| gzip -c > $outdir/alt/$a.xml.gz" ) ){
			print "alternative:\t$outdir/$a.xml.gz\t$alignOverlap{$a}\t$alignScore{$a}\t\t$alignTotalScore{$a}\n";
			&print_ces($alignedfiles{$a}[0],
				   $alignedfiles{$a}[1],
				   $alignments{$a},
				   { 
				       overlap => $alignOverlap{$a},
				       algscore => $alignScore{$a}, 
				       score => $alignTotalScore{$a} 
				   },
				   $fh);
		    }
		}
	    }
	}
    }
    return $alignTotalScore{$best};
}


sub subalign{
    my ($src,$trg,$alignments)=@_;

    my ($score,$overlap) = 
	&align($src,$trg,$alignments,
	   SOURCE_LANG => $opt_S,   # setting source and target language enables
	   TARGET_LANG => $opt_T,   # dic-lookup & also sets BEST_ALIGN (!!)
	   SCORE_PROPORTION => 1,   # always use proportional score
	   FALLBACK => $opt_f,
	   VERBOSE => $opt_v,
	   BEST_ALIGN => $opt_b,
	   USE_WORDFREQ => $opt_q,
	   USE_IDENTICAL => $opt_i,  # use cognate filter (identical words)
	   CHAR_SET => $opt_s,
	   TOK_LEN => $opt_t,
	   MAX_MATCHES => $opt_m,
	   USE_COGNATES => $opt_c,   # use cognate filter (lcsr)
	   COGNATE_RANGE => $opt_r,  # use cognate fillter (1..score)
	   USE_DICTIONARY => $opt_d, # use dictionary filter
	   UPPER_CASE => $opt_u,     # cognate filter with upper case words only
	   MINLENGTH => $opt_l,      # minimum token length (default=5)
	   WINDOW => $opt_w);        # window for lexical matching (default=25)

    return wantarray ? ($score,$overlap) : $score;
}



sub time_overlap_ratio{
    my ($frames1,$frames2)=@_;
    my $common=0;
    my $diff=0;

    my @time1=@{$frames1};
    my @time2=@{$frames2};

    my $start1=shift(@time1);
    my $end1=shift(@time1);

    my $start2=shift(@time2);
    my $end2=shift(@time2);

    ## TODO: should we skip extra frames in the beginning?
    ## (need to do do that here in that case)

    while ($end1 && $end2){

	# sub1 frame is completely before sub2 frame
	if ($end1 < $start2){
	    $diff+=($end1-$start1);
	    $start1=shift(@time1);
	    $end1=shift(@time1);
	    next;
	}
	# sub2 frame is completely before sub1 frame
	if ($end2 && ($end2 < $start1)){
	    $diff+=($end2-$start2);
	    $start2=shift(@time2);
	    $end2=shift(@time2);
	    next;
	}

	my $CommonStart;
	# sub1 frame starts before sub2 frame
	if ($start1 < $start2){
	    $diff+=($start2-$start1);
	    $CommonStart=$start2;
	}
	# sub2 frame starts before sub1 frame
	else{
	    $diff+=($start1-$start2);
	    $CommonStart=$start1;
	}

        # sub1 frame ends before sub2
	if ($end1 < $end2){
	    $common+=($end1-$CommonStart);
	    $start2=$end1;         # move start2 to end of sub1 frame
	    $start1=shift(@time1);
	    $end1=shift(@time1);
	}
	# sub2 frame ends before sub1
	else{
	    $common+=($end2-$CommonStart);
	    $start1=$end2;         # move start1 to end of sub2 frame
	    $start2=shift(@time2);
	    $end2=shift(@time2);
	}
    }

    ## TODO: should we skip extra frames at the end?

    # remaining sub2 frames
    if (!$start1){
	while ($start2 && $end2){
	    $diff+=($end2-$start2);
	    $start2=shift(@time2);
	    $end2=shift(@time2);
	}
    }

    # remaining sub1 frames
    if (!$start2){
	while ($start1 && $end1){
	    $diff+=($end1-$start1);
	    $start1=shift(@time1);
	    $end1=shift(@time1);
	}
    }
    if ($common || $diff){
	return $common/($common+$diff);
    }
    return 0;
}



## TODO: should this be done in Align.pm? (together with parse_bitext)

sub read_time_frames{
    my $file=shift;
    my $time=shift;
    my $meta=shift;

    if ($file=~/\.gz$/){
	open F,"gzip -cd < $file |" || die "cannot open $file!\n";
    }
    else{
	open F, "<$file" || die "cannot open $file!\n";
    }
    while (<F>){
	if (/\<time.*value\=\"(.*?)\"/){
	    my $sec = time2sec($1);
	    ## sanityt check: time should always increase!
	    if ((!@{$time}) || ($$time[-1]<$sec)){
		push(@$time,time2sec($1));
	    }
	}
	if (/<meta>/){
	    my $block = '';
	    while (<F>){
		if (/\<(.*?)>(.*)<\//){
		    my $tag = $1;
		    if ($tag eq 'duration'){
			if ($block ne 'subtitle'){
			    $tag = $block.'-'.$tag;
			} 
		    }
		    $$meta{$tag}=$2;
		}
		elsif (/\<(.*?)>/){
		    $block = $1;
		}
		last if (/<\/meta>/);
	    }
	    if (exists $$meta{date}){
		my ($yy,$mm,$dd) = split(/-/,$$meta{date});
		$$meta{upload_date} = timelocal(0,0,0,$dd,$mm-1,$yy);
	    }
	    if (exists $$meta{duration}){
		my ($hh,$mm,$ss) = split(/:/,$$meta{duration});
		$$meta{duration_seconds} = $ss+60*$mm+3600*$hh;
	    }
	}
    }
    close F;
    print STDERR ".";
}





sub time2sec{
    my $time=shift;
    my ($h,$m,$s,$ms)=split(/[^0-9\-]/,$time);
    my $sec = 3600*$h+60*$m+$s+$ms/1000;
    return $sec;
}
