Fragmentation Script

From LMNLWiki

See documentation on the Automatic Fragmentation page.

#!/usr/bin/perl

# This script will create a well-formed XML-file from a series of markers, 
# even if the markers do not represent a well-formed structure. 
# Overlap will be fragmented.
# Copyright 2007 Bert Van Elsacker
# bertve@gmail.com


open (INPUT, $ARGV[0]);
my @lines = <INPUT>;
my $invoer = join("", @lines);

$/='<>';
open (TEXT, "text.data");
my @text = <TEXT>;
my $txt = join("", @text);
my @textfields = split(/\<\>/, $txt);

# This is a prototype version
# It processes markers in this format:
# 
# b1<u b3<u b6<t e1<u e6<t e3<u 
#
# where:
# b/e: beginning/ending
# number: element ID
# <: delimiter
# text after the delimiter: element type, used for determining the priority
#
# If you use XML, obviously you need to pre-process your file.
# The next version of this script will take care of this.
# Also make sure ID's occur in ascending order... (only in the b-markers)
# The file "text.data" will be used to insert text in between the markers. The delimiter for the text fields is "<>".
# We assume there's a text field paired to each marker, containing text or markup which should not be processed
# The text field may be empty
# It follows the marker
# So the original file should start with a marker
# The text fields will be automatically added to the fragmented structure

# This is one of my first Perl programs, the code should be cleaned up and improved to run faster;

# Priority Table
# Hard coded, please adapt for your own needs
# Highest number = highest priority
my %priority = (
    sc => 1,
    i => 1,
    b => 6,
    q => 2,
    p => 3,
    L => 4,
    a => 1,
    c => 2,
    d => 7,
);
my @posE;
my @posB;
my @type;
my @xxx;
# list the markers
@markers = split(/ /, $invoer);
@start = grep(/^b/, @markers);
@end = grep(/^e/, @markers);
for ($i = 0; $i <= $#markers; $i++) {
    $markers[$i] =~ /^(b|e)([0-9]+)<(.+)$/;
        #print "\n" . $1 . " " . $2 . " " . $3;
        $markersID[$i] = $2;
        $markersRole[$i] = $1;
        $markersType[$i] = $3;

        if ($1 eq "b") {
          $posB[$2] = $i;
          $type[$2] = $3;

        }
        else {
        # there must be an element with this ID
          $posE[$2] = $i;


        }


}

# Identify overlap

	
for ($i = 0; $i <= $#posE; $i++) {
	my %overlap = ();
	for ($j=$posB[$i]+1; $j<$posE[$i]; $j++) {
		if ($markersRole[$j] eq "b") {
			$idvalue = $markersID[$j];
			$overlap{$posE[$idvalue]} = $idvalue;
		}
		elsif (exists $overlap{$j}) 	{
                            delete $overlap{$j};
		}
		else {
			$overlap{$j} = $markersID[$j];
		}
	}
	foreach $key (keys %overlap) {
	}
	if (keys %overlap > 0) {
		$xxx[$i] = { %overlap };
		# @xxx is a (discontinuous) array, index = ID, value = hash 
                # hash: key = end point, value = id of the overlapping element
	}
}

# Identify the clusters
$i=0;
# registerd is a hash with  index=id, value=cluster number
#clc is a counter for the cluster numbers
$clc=-1;

@temp=[];

# Start with the first element
# Check if the element has already a cluster number 
# If not, check if an overlapping element more to the right has been attributed a cluster number
#	if so, use this cluster number for the current element
#	if not, use a new cluster number for the current element and overlapping elements

for ($i=0; $i <= @xxx; $i++) {
	if (%{$xxx[$i]}) {
		if (defined $registered[$i]) {
		}
		if (! defined $registered[$i]) {
			$localclc=-1;
			#hash overlapping is a temporary hash with key=end point value=id of all overlapping elements
			%overlapping = %{$xxx[$i]};
			foreach my $od (values %overlapping) {
				if ($localclc < 0) {
					if (defined $registered[$od]) {
						$localclc = $registered[$od];
						$registered[$i] = $localclc;
					}
				}
				else {
					$registered[$od] = $localclc;
				}
			}
			# none of the overlapping elements has a cluster number, create a new cluster number 
                        # and attribute it to the current element and all overlapping elements
			if ($localclc < 0) {
				$clc++;
				$registered[$i] = $clc;
				foreach my $od (values %overlapping) {
					$registered[$od] = $clc;
				}
			}
		}
        }	 
}

#clusters is a hash with index=cluster number, value=array with  id-values

my %clusters;

$c=0;
foreach (@registered) {
	
	if (defined $registered[$c]) {
		push @{ $clusters{$_} }, $c;
	}
	
	$c++;
}
# actual fragmentation starts here

# sort the list of elements in this cluster by priority

foreach $key ( keys %clusters ) {
	
	my %temp = ();
	foreach ( @{$clusters{$key}} ) {
		$temp{$_} = $priority{$type[$_]};
	}
	@twisted = sort {
		# primary subkeys comparison
		$temp{$b} <=> $temp{$a}
				||
		# or if they are equal
		# return secondary comparison
		$a <=> $b
	} keys %temp;
	# fragmentation
	# @twisted contains the id's of all elements in the cluster by descending priority
	# compare each element in the array with the preceding ones
	# and identify the points where the current element should break
	# which are the start and end points of any element with a higher priority than the current element
	
	for ( $i=0; $i<@twisted; $i++ ) {
		%brtemp = ();
		for ($j=0; $j<$i; $j++) {
			$VE1=$twisted[$i];
			$VE2=$twisted[$j];
			if ( $posB[$VE1] < $posB[$VE2] and $posB[$VE2] < $posE[$VE1] ) {
					#overlap
				$brtemp{$posB[$VE2]}="x";
			}
			if ( $posB[$VE1] < $posE[$VE2] and $posE[$VE2] < $posE[$VE1] )	{
				#overlap
				$brtemp{$posE[$VE2]}="x";
			}
		}
		
		# add the boundaries of the current element to the list of fixed boundaries
		foreach (keys %brtemp) {
			push @{ $brokenHere{$_} }, $twisted[$i];
			
		}
	}
}

#print the fragmented structure
for ($i = 0; $i <= $#markers; $i++) {
	
	#first print prepended closing tags
	if (defined $brokenHere{$i}) {
		
		for ( reverse @{ $brokenHere{$i} } ) {
			print "</$type[$_]>"; 
			
			
		}
	}
	
	
	#print the original marker
	if ($markersRole[$i] eq "b") {
		print "<$markersType[$i] xid=\"$markersID[$i]\" orig=\"y\">";
	}
	else {
		print "</$markersType[$i]>";
	}
	
	#print newmade opening tags
	if (defined $brokenHere{$i}) {
		
		for ( @{ $brokenHere{$i} } ) {
			print "<$type[$_] xid=\"$_\">"; 
			
			
		}
	}
	
	#print the text of the original file
	print $textfields[$i];
}