Fragmentation Script
From LMNLWiki
See documentation on the Automatic Fragmentation page.
#!/usr/bin/perl
# This script will create a well-formed XML-file from a series of markers,
# even if the markers do not represent a well-formed structure.
# Overlap will be fragmented.
# Copyright 2007 Bert Van Elsacker
# bertve@gmail.com
open (INPUT, $ARGV[0]);
my @lines = <INPUT>;
my $invoer = join("", @lines);
$/='<>';
open (TEXT, "text.data");
my @text = <TEXT>;
my $txt = join("", @text);
my @textfields = split(/\<\>/, $txt);
# This is a prototype version
# It processes markers in this format:
#
# b1<u b3<u b6<t e1<u e6<t e3<u
#
# where:
# b/e: beginning/ending
# number: element ID
# <: delimiter
# text after the delimiter: element type, used for determining the priority
#
# If you use XML, obviously you need to pre-process your file.
# The next version of this script will take care of this.
# Also make sure ID's occur in ascending order... (only in the b-markers)
# The file "text.data" will be used to insert text in between the markers. The delimiter for the text fields is "<>".
# We assume there's a text field paired to each marker, containing text or markup which should not be processed
# The text field may be empty
# It follows the marker
# So the original file should start with a marker
# The text fields will be automatically added to the fragmented structure
# This is one of my first Perl programs, the code should be cleaned up and improved to run faster;
# Priority Table
# Hard coded, please adapt for your own needs
# Highest number = highest priority
my %priority = (
sc => 1,
i => 1,
b => 6,
q => 2,
p => 3,
L => 4,
a => 1,
c => 2,
d => 7,
);
my @posE;
my @posB;
my @type;
my @xxx;
# list the markers
@markers = split(/ /, $invoer);
@start = grep(/^b/, @markers);
@end = grep(/^e/, @markers);
for ($i = 0; $i <= $#markers; $i++) {
$markers[$i] =~ /^(b|e)([0-9]+)<(.+)$/;
#print "\n" . $1 . " " . $2 . " " . $3;
$markersID[$i] = $2;
$markersRole[$i] = $1;
$markersType[$i] = $3;
if ($1 eq "b") {
$posB[$2] = $i;
$type[$2] = $3;
}
else {
# there must be an element with this ID
$posE[$2] = $i;
}
}
# Identify overlap
for ($i = 0; $i <= $#posE; $i++) {
my %overlap = ();
for ($j=$posB[$i]+1; $j<$posE[$i]; $j++) {
if ($markersRole[$j] eq "b") {
$idvalue = $markersID[$j];
$overlap{$posE[$idvalue]} = $idvalue;
}
elsif (exists $overlap{$j}) {
delete $overlap{$j};
}
else {
$overlap{$j} = $markersID[$j];
}
}
foreach $key (keys %overlap) {
}
if (keys %overlap > 0) {
$xxx[$i] = { %overlap };
# @xxx is a (discontinuous) array, index = ID, value = hash
# hash: key = end point, value = id of the overlapping element
}
}
# Identify the clusters
$i=0;
# registerd is a hash with index=id, value=cluster number
#clc is a counter for the cluster numbers
$clc=-1;
@temp=[];
# Start with the first element
# Check if the element has already a cluster number
# If not, check if an overlapping element more to the right has been attributed a cluster number
# if so, use this cluster number for the current element
# if not, use a new cluster number for the current element and overlapping elements
for ($i=0; $i <= @xxx; $i++) {
if (%{$xxx[$i]}) {
if (defined $registered[$i]) {
}
if (! defined $registered[$i]) {
$localclc=-1;
#hash overlapping is a temporary hash with key=end point value=id of all overlapping elements
%overlapping = %{$xxx[$i]};
foreach my $od (values %overlapping) {
if ($localclc < 0) {
if (defined $registered[$od]) {
$localclc = $registered[$od];
$registered[$i] = $localclc;
}
}
else {
$registered[$od] = $localclc;
}
}
# none of the overlapping elements has a cluster number, create a new cluster number
# and attribute it to the current element and all overlapping elements
if ($localclc < 0) {
$clc++;
$registered[$i] = $clc;
foreach my $od (values %overlapping) {
$registered[$od] = $clc;
}
}
}
}
}
#clusters is a hash with index=cluster number, value=array with id-values
my %clusters;
$c=0;
foreach (@registered) {
if (defined $registered[$c]) {
push @{ $clusters{$_} }, $c;
}
$c++;
}
# actual fragmentation starts here
# sort the list of elements in this cluster by priority
foreach $key ( keys %clusters ) {
my %temp = ();
foreach ( @{$clusters{$key}} ) {
$temp{$_} = $priority{$type[$_]};
}
@twisted = sort {
# primary subkeys comparison
$temp{$b} <=> $temp{$a}
||
# or if they are equal
# return secondary comparison
$a <=> $b
} keys %temp;
# fragmentation
# @twisted contains the id's of all elements in the cluster by descending priority
# compare each element in the array with the preceding ones
# and identify the points where the current element should break
# which are the start and end points of any element with a higher priority than the current element
for ( $i=0; $i<@twisted; $i++ ) {
%brtemp = ();
for ($j=0; $j<$i; $j++) {
$VE1=$twisted[$i];
$VE2=$twisted[$j];
if ( $posB[$VE1] < $posB[$VE2] and $posB[$VE2] < $posE[$VE1] ) {
#overlap
$brtemp{$posB[$VE2]}="x";
}
if ( $posB[$VE1] < $posE[$VE2] and $posE[$VE2] < $posE[$VE1] ) {
#overlap
$brtemp{$posE[$VE2]}="x";
}
}
# add the boundaries of the current element to the list of fixed boundaries
foreach (keys %brtemp) {
push @{ $brokenHere{$_} }, $twisted[$i];
}
}
}
#print the fragmented structure
for ($i = 0; $i <= $#markers; $i++) {
#first print prepended closing tags
if (defined $brokenHere{$i}) {
for ( reverse @{ $brokenHere{$i} } ) {
print "</$type[$_]>";
}
}
#print the original marker
if ($markersRole[$i] eq "b") {
print "<$markersType[$i] xid=\"$markersID[$i]\" orig=\"y\">";
}
else {
print "</$markersType[$i]>";
}
#print newmade opening tags
if (defined $brokenHere{$i}) {
for ( @{ $brokenHere{$i} } ) {
print "<$type[$_] xid=\"$_\">";
}
}
#print the text of the original file
print $textfields[$i];
}
