His desire was to export this information in a form that it could be treated much more like a proper database - spreadsheet format would be a start. Exporting the Word document to text was easy, but where to go from there? I started toying around with a Perl script to try to clean up the data, and found that there were some convenient modules around for capturing input to a vCard standard format.
The attached script uses the Text::vCard module (which you'll have to install manually from CPAN as it isn't available using PPM) with its Addressbook and Node packages to build up a whole address book from the input file. Although I have made no attempt at elegance, it has a good stab at parsing UK-style and US-style company names and addresses into their constituent fields, provided they are separated by commas. NB there are no contact names, as the original data file didn't have any - this is left as an exercise for the reader!
#!/usr/bin/perl -w
#
# convert a file to address records in VCF (vCard) format
# reads standard input and writes to standard output
#
if ( $#ARGV + 1 != 0 ) {
print STDERR "usage: parse_addresses <inputfile.txt >outputfile.vcf\n";
exit;
}
use Text::vCard::Addressbook;
my $addressbook = new Text::vCard::Addressbook;
# For testing / debugging: load a pre-existing address book
#my $addressbook =
# Text::vCard::Addressbook->new(
# { 'source_file' => 'C:/temp/Text-vCard-2.03/rfc2426.vcf', } );
while ( !eof STDIN ) {
parseEntry($addressbook);
}
print $addressbook->export();
sub parseEntry {
my ($addressbook) = @_;
my $line = <STDIN>;
chomp($line);
# First line should contain organisation name and optionally its address
my ( $org, $addr );
if ( $line =~ m'^\s*([0-9A-Za-z][^:]*)\:?\s*$' ) {
$org = $1;
chomp( $line = <STDIN> );
$addr = $line;
}
elsif ( $line =~ m'^\s*([0-9A-Za-z][^:]*)\:\s*(\S.*)$' ) {
$org = $1;
$addr = $2;
}
else {
# Not recognised
print STDERR "Not recognised start of entry: $line\n";
while ( ( defined $line ) && ( $line !~ '^\s*$' ) ) {
chomp( $line = <STDIN> );
print STDERR "Discarding: $line\n";
}
return undef;
}
# print STDERR "Organisation: $org -- Address: $addr\n";
my $vCard = $addressbook->add_vcard();
$vCard->version('3.0');
$vCard->add_node({ 'node_type' => 'ORG' })->name($org);
my $adr = $vCard->add_node( { 'node_type' => 'ADR' } );
my @unsorted = ();
$addr =~ s/\W*$//; # Remove trailing spaces and full-stops
foreach my $adrField ( split( ',', $addr ) ) {
if ( $adrField =~ m'^\s*([A-Z][A-Za-z ]+\s)?([-0-9]{4,11})\s*([A-Z][A-Za-z]+)?\s*$' ) {
# US state and zip-code
my $state = $1;
chop( $state ) if defined $state;
my $zip_code = $2;
my $country = $3;
if ( defined $state ) { $adr->region($state); }
$adr->post_code($zip_code);
if ( defined $country ) { $adr->country($country); }
} elsif ( $adrField =~ m'^\s*([A-Z][A-Za-z ]+\s)?([A-Z]{1,2}[0-9O]{1,2}[A-Z]? [0-9O]{1,2}[A-Z]{2})\s*([A-Z][A-Za-z ]+)?\s*$' ) {
# UK county and post-code
my $county = $1;
chop ($county) if defined $county;
my $post_code = $2;
my $country = $3;
if ( defined $county ) { $adr->region($county); }
$post_code =~ s/O([0-9][A-Z]?) ([0-9])/0$1 $2/;
$post_code =~ s/O([A-Z])? ([0-9])/0$1 $2/;
$post_code =~ s/ O([0-9])/ O$1/;
$post_code =~ s/ ([0-9])O([A-Z]{2})/ $1O$2/;
$adr->post_code($post_code);
if ( defined $country ) { $adr->country($country); }
} elsif ( $adrField =~ m'P.*BOX\s*(\d+)'i ) {
# Post Office Box
my $po_box = $1;
$adr->po_box($po_box);
} elsif ( $adrField =~ m'^\s*(\d+\s+[A-Za-z0-9 ]+$)' ) {
# House number and street
my $street = $1;
$adr->street($street);
} elsif ( $adrField =~ m'^\s*([A-Z][A-Za-z0-9 ]+$)' ) {
push @unsorted, $1;
}
}
# Retrieve unsorted items in reverse order
if (!defined $adr->country() && $#unsorted > 2) {
my $country = pop @unsorted;
if (defined $country && $country =~ m'[A-Z][A-Za-z ]+') {
$adr->country( $country );
} else {
push @unsorted, $country;
}
}
if (!defined $adr->region() && $#unsorted > 1) {
my $region = pop @unsorted;
if (defined $region && $region =~ m'[A-Z][A-Za-z ]+') {
$adr->region( $region );
} else {
push @unsorted, $region;
}
}
if (!defined $adr->city()) {
my $city = pop @unsorted;
if (defined $city && $city =~ m'[A-Z][A-Za-z ]+') {
$adr->city( $city );
} else {
push @unsorted, $city;
}
}
if (!defined $adr->street()) {
$adr->street( pop @unsorted );
}
if ($#unsorted >= 0) {
$adr->extended ( join (', ', @unsorted) );
}
chomp( $line = <STDIN> );
while ( ( defined $line ) && ( $line !~ '^\s*$' ) ) {
if ( $line =~ m'^\s*(https?\://\S+)'i ) {
$vCard->url($1);
} elsif ( $line =~ m'(www\.\S+)'i ) {
$vCard->url($1);
} elsif ( $line =~ m'(\S+@\S+)\s*$' ) {
my $email = $1;
my $node = $vCard->add_node( { 'node_type' => 'EMAIL' } );
my @types = qw (work internet);
$node->add_types( \@types );
$node->value($email);
} elsif ( $line =~ m'^\s*(\+?[0-9() ]+)$' ) {
my $tel = $1;
my $node = $vCard->add_node( { 'node_type' => 'TEL' } );
my @types = qw (work voice);
$node->add_types( \@types );
$node->value($tel);
} elsif ( $line =~ m'^\s*tel.*\:\s*(\S.*)$'i ) {
my $tel = $1;
my $node = $vCard->add_node( { 'node_type' => 'TEL' } );
my @types = qw (work voice);
$node->add_types( \@types );
$node->value($tel);
} elsif ( $line =~ m'^\s*fax.*\:\s*(\S.*)$'i ) {
my $tel = $1;
my $node = $vCard->add_node( { 'node_type' => 'TEL' } );
my @types = qw (work fax);
$node->add_types( \@types );
$node->value($tel);
} elsif ( $line =~ m'^[^:]*name[^:]*\:\s*(\S.*)$'i ) {
$vCard->fn($1);
} elsif ( $line =~ m'^\s*([^:]+)\:\s*$' ) {
my $noteHead = $1;
chomp( $line = <STDIN> );
if ( $line =~ m'^\s*(https?\://\S+)'i ) {
$vCard->url($1);
} elsif ( $line =~ m'(www\.\S+)'i ) {
$vCard->url($1);
} elsif ( $line =~ m'(\S+@\S+)\s*$' ) {
my $email = $1;
my $node = $vCard->add_node( { 'node_type' => 'EMAIL' } );
my @types = qw (work internet);
$node->add_types( \@types );
$node->value($email);
} elsif ( $line =~ m'^\s*(\+?[0-9() ]+)$' ) {
my $tel = $1;
my $node = $vCard->add_node( { 'node_type' => 'TEL' } );
my @types = qw (work voice);
$node->add_types( \@types );
$node->value($tel);
} else {
$vCard->note("$noteHead: $line");
}
} elsif ( $line =~ m'^\s*([^:]+)\:\s*(\S.*)$' ) {
my $noteHead = $1;
my $note = $2;
$vCard->note("$noteHead: $note");
} else {
print STDERR "Cannot understand entry detail: $line\n";
}
chomp( $line = <STDIN> );
}
}
1; # End.
(Look out for cut-off long lines in the above - copy/paste works far better in Firefox than in Internet Explorer. If you have problems, ask me to e-mail you a copy of the script). The script spits out the resulting vCard file to the standard output. Anything it can't parse is echoed to standard error.
I found a really nifty converter from VCF to CSV at http://labs.brotherli.ch/vcfconvert/ - once you've got a pile of addresses in spreadsheet format, you can do anything with it.
No comments:
Post a Comment