#!/usr/local/bin/perl -w # # pgrdf2marc.pl converts one or more items from the Project Gutenberg RDF # catalog into MARC format record(s). # # Detailed POD-style documentation is at the end of this file. use strict; use Getopt::Long; use Fcntl; use GDBM_File; #----------------------------------------------------------------------- # Configurables: # Organisation code my $org = 'PGUSA'; my $publisher = 'Project Gutenberg'; my $marcdbpath = '/home/gbnewby/pgiso/marc/marc.db'; my $newpath = '/home/gbnewby/pgiso/marc'; my $chgpath = '/home/gbnewby/pgiso/marc'; #----------------------------------------------------------------------- # NON-Configurables: # ISO 639 Language Codes # populate a hash mapping 639-1 (2-letter) codes to 639-2 (3-letter) codes my %map639 = qw( ab abk aa aar af afr sq alb am amh ar ara hy arm as asm ay aym az aze ba bak eu baq bn ben bh bih bi bis be bre bg bul my bur be bel ca cat zh chi co cos hr scr cs cze da dan nl dut dz dzo en eng eo epo et est fo fao fj fij fi fin fr fre fy fry gl glg ka geo de ger el gre kl kal gn grn gu guj ha hau he heb hi hin hu hun is ice id ind ia ina iu iku ik ipk ga gle it ita ja jpn jv jav kn kan ks kas kk kaz km khm rw kin ky kir ko kor ku kur oc oci lo lao la lat lv lav ln lin lt lit mk mac mg mlg ms may ml mlt mi mao mr mar mo mol mn mon na nau ne nep no nor or ori om orm pa pan fa per pl pol pt por ps pus qu que rm roh ro rum rn run ru rus sm smo sg sag sa san sr scc sh scr sn sna sd snd si sin ss ssw sk slo sl slv so som st sot es spa su sun sw swa sv swe tl tgl tg tgk ta tam tt tat te tel th tha bo tib ti tir to tog ts tso tn tsn tr tur tk tuk tw twi ug uig uk ukr ur urd uz uzb vi vie vo vol cy wel wo wol xh xho yi yid yo yor za zha zu zul ); # input record separator is blank line $/ = "\n\n"; # %cat is a temporary hash for the data extracted from RDF for each item. my %cat; my %options = ( debug => 0 ); GetOptions(\%options, 'debug!', 'help'); if ($options{'help'}) { print qq! pgrdf2marc.pl converts the Project Gutenberg RDF/XML format catalog into MARC21 format records. RDF is read from STDIN, and MARC output to STDOUT. Usage: pgrdf2marc.pl [ --help | --debug ] --debug will dump the RDF and the MARC in text form --help prints this message !; exit; } my %MARCDB; # dbmopen %MARCDB, $marcdbpath, 0644 or die "Cannot open MARCDB file: $!\n"; tie %MARCDB, 'GDBM_File', $marcdbpath, O_CREAT|O_RDWR, 0644; my $today = today(); open CHANGE, ">$chgpath/$today.mrc" or die "Cannot open CHANGE file: $!\n"; open NEW, ">$newpath/$today.mrc" or die "Cannot open NEW file: $!\n"; #----------------------------------------------------------------------- my $rdfrecs = 0; my $marcrecs = 0; my $changes = 0; my $newrecs = 0; while (<>) { next unless /rdf:ID="etext(\d+)"/; $rdfrecs++; if ($options{'debug'}) { # dump the RDF # print "$_\n"; } tr/\r/ /; # remove CR (if any) # convert commonly used character entities s/ & / and /sg; s/&/&/sg; s/—/--/sg; if (&parse_rdf($_)) { my @trec = &build_trec(); if ($options{'debug'}) { # dump the MARC (pretty-printed) foreach (@trec) { print "$_\n"; } print "\n"; } else { my $marc = &array2marc(@trec); my $id = $cat{id}; # compare with existing record for change file my $old = $MARCDB{$id}; $old = '' unless $old; my $new = $marc; # strip everything before 008 data ... (yes, horrible kludge!) $old =~ s/.*?PGUSA.\d{14}\.0//; $new =~ s/.*?PGUSA.\d{14}\.0//; # if the record has changed, save it and write a copy to the CHANGE file unless ( $new eq $old ) { $MARCDB{$id} = $marc; if ($old) { print CHANGE $marc; $changes++; } else { print NEW $marc; $newrecs++; } } } $marcrecs++; } } # now write out the MARC records to files, in bundles of 1000 my $outlimit = 0; foreach my $id ( sort {$a <=> $b} keys %MARCDB ) { if ( $id > $outlimit ) { close OUT if $outlimit; $outlimit += 1000; my $outfile = sprintf "marc/%06d.mrc", $outlimit; open OUT, ">$outfile" or die "Cannot open $outfile, $!\n"; } print OUT $MARCDB{$id}; } close OUT; close CHANGE; #dbmclose %MARCDB; untie %MARCDB; print STDERR qq| $rdfrecs RDF records processed $marcrecs MARC records written $changes changed MARC records $newrecs new MARC records |; exit(0); #----------------------------------------------------------------------- sub parse_rdf { # parse an rdf entry and store the data in our catalogue hash # clear %cat; %cat = (); $cat{title} = []; $cat{created} = '||||-||-||'; # record must have an id ... if (/rdf:ID="etext(\d+)"/) { $cat{id} = $1; } else { return 0; } # ... and a title ... unless (/(.*?)<\/dc:title>/s) { print STDERR "$cat{id} has no title!\n"; return 0; } while (s/(.*?)<\/dc:title>//s) { $cat{title} = [ @{ $cat{title} }, &split_field($1) ]; } if (/(.*)<\/rdf:value>/) { $cat{created} = $1; } #if (/(.*)<\/rdf:value>/) { if (m!(.+)!) { $cat{language} = $1; } if (/(.*)<\/rdf:value>/) { $cat{type} = $1; } if (/(.*)<\/dc:rights>/) { $cat{rights} = $1; } if (/(.*)<\/dc:creator>/s) { $cat{author} = [ &split_field($1) ]; } if (/(.*)<\/dc:contributor>/s) { $cat{contributor} = [ &split_field($1) ]; } if (/(.*)<\/dcterms:LCC>/s) { $cat{call} = [ &split_field($1) ]; } if (/(.*)<\/dcterms:LCSH>/s) { $cat{subject} = [ &split_field($1) ]; } if (/(.*)<\/dc:alternative>/is) { my $string = $1; $string =~ s/\s+/ /gs; $cat{alternative} = [ &split_field($string) ]; } if (/(.*)<\/dc:description>/is) { my $string = $1; $string =~ s/\s+/ /gs; $cat{description} = [ &split_field($string) ]; } if (/(.*)<\/dc:tableOfContents>/is) { my $string = $1; $string =~ s/\n/ -- /gs; $string =~ s/\s+/ /gs; $cat{contents} = [ &split_field($string) ]; } # experimental kludge to determine genre # genre is the 'Literary form' specified in the 008 character pos. 33 GENRE: { if (/other (stories|tales)/i) { $cat{genre} = 'j'; last; } if (/(poems|poetry)/i) { $cat{genre} = 'p'; last; } if (/essays/i) { $cat{genre} = 'e'; last; } if (/letters/i) { $cat{genre} = 'i'; last; } if (/plays/i) { $cat{genre} = 'd'; last; } if (/fiction/i) { $cat{genre} = '1'; last; } $cat{genre} = '|'; } return 1; } sub split_field { # split a (possibly) multivalued field, returning values as an array. my $field = shift; # join multiple lines $field =~ s/>\s+([^<]+) 1; foreach my $title (@titles) { # if the title contains line breaks, then the first marks the start of a subtitle, and # any others are a mistake. $title =~ s/\n/ : /s; $title =~ s/\n/ /gs; $ind2 = nonfilingIndicator( $title, $language ); if ( $ut ) { # Uniform title $title =~ s{ (Catalan|Czech|Dutch|English|Esperanto|Finnish|French|German|Greek|Polish|Portuguese|Romany|Spanish)$} {|l$1}; push @trec, sprintf "240 1%1d|a%s", $ind2, $title; $ut = 0; } else { # main title $title =~ s/Other Stories/other stories/i; for ($title) { my ($a, $b) = split /:/, $title, 2; if ( defined $b ) { $b =~ s/^\s+//; if ( $b =~ s/^(and|or),{0,1} /$1, / ) { $title = "$a ;|b$b"; } elsif ($b) { $title = "$a :|b$b"; } } } $title =~ s/\s+/ /g; $resp = &resp_stmt(); if ($resp) { push @trec, sprintf "245 1%1d|a%s|h[electronic resource] /|c%s", $ind2, $title, $resp; } else { push @trec, sprintf "245 1%1d|a%s|h[electronic resource]", $ind2, $title; } } } push @trec, "260 |b$publisher,|c$pubyear"; #push @trec, "500 |aProject Gutenberg"; # Description foreach (@description) { push @trec, sprintf "500 |a%s", $_; } # Contents note if (@contents) { if ($#contents > 0) { # more than one element in contents! push @trec, sprintf "505 0 |a%s", (join '--', @contents); } else { push @trec, sprintf "505 0 |a%s", $contents[0]; } } # Rights management / copyright statement if ($rights) { push @trec, "506 |a$rights"; } else { push @trec, "506 |aFreely available."; } push @trec, "516 |a$type"; # Subject headings # Note: we're using the 653 "uncontrolled terms" field, not LCSH foreach (@subjects) { push @trec, sprintf "653 |a%s", $_; } foreach (@authors) { my ($name, $d, $q, $c, $role) = &munge_author( $_ ); $temp = "700 1 |a$name"; $temp .= "|c$c" if $c; $temp .= "|q$q" if $q; $temp .= ",|d$d" if $d; $temp .= ",|e$role" if $role; push @trec, $temp; } foreach (@contribs) { my ($name, $d, $q, $c, $role) = &munge_author( $_ ); $temp = "700 1 |a$name"; $temp .= "|c$c" if $c; $temp .= "|q$q" if $q; $temp .= ",|d$d" if $d; $temp .= ",|e$role" if $role; push @trec, $temp; } foreach (@alternative) { push @trec, sprintf "740 0 |a%s", $_; } push @trec, sprintf "830 0|aProject Gutenberg|v%d", $id; push @trec, sprintf "856 40|uhttp://www.gutenberg.org/ebooks/%d", $id; push @trec, "856 42|uhttp://www.gutenberg.org/license|3Rights" unless $rights; return @trec; } sub nonfilingIndicator { # set non-filing indicator for title. my ($title, $language) = @_; my $ind2 = 0; ARTICLE_CASE: { if ($language eq 'en') { # English $ind2 = 4 if $title =~ /^The /; $ind2 = 3 if $title =~ /^An /; $ind2 = 2 if $title =~ /^A /; last; } if ($language eq 'fr') { # French $ind2 = 4 if $title =~ /^(Les|Une) /; $ind2 = 3 if $title =~ /^(Un|Le|La) /; $ind2 = 2 if $title =~ /^L'/; last; } if ($language eq 'de') { # German $ind2 = 5 if $title =~ /^Eine /; $ind2 = 4 if $title =~ /^(Ein|Der|Das|Die) /; last; } if ($language eq 'du') { # Dutch $ind2 = 4 if $title =~ /^(Een|Ene|Het) /; $ind2 = 3 if $title =~ /^De /; last; } if ($language eq 'es') { # Spanish $ind2 = 4 if $title =~ /^(Las|Los) /; $ind2 = 3 if $title =~ /^El /; last; } if ($language eq 'it') { # Italian $ind2 = 3 if $title =~ /^Il /; last; } if ($language eq 'pt') { # Portuguese $ind2 = 4 if $title =~ /^(Uma) /; $ind2 = 3 if $title =~ /^(As|Os|Um) /; $ind2 = 2 if $title =~ /^(A|O) /; last; } } return $ind2; } sub munge_author { my $name = shift; my ($d, $q, $c, $role); # extract and discard role -- between [] if ($name =~ s/ \[([^\]]+)\]//) { $role = "\L$1\E"; } # extract the dates (if any) and discard from name # dates are assumed as anything starting with a digit if ($name =~ s/, ([1-9-].+)//) { $d = $1; } # extract and discard any expanded forenames -- these will be in () if ($name =~ s/ (\(.+\))//) { $q = $1; } # extract and discard title if ($name =~ s/ (Sir|Lord|Mrs|Rev|Saint|Dr|Jr)\b\.{0,1}//) { $c = $1; } return ($name, $d, $q, $c, $role); } sub resp_stmt { # generate a statement of responsibility from the author fields my ($author, $name, $role, $resp); my @authors = @{ $cat{author} } if $cat{author}; my @contribs = @{ $cat{contributor} } if $cat{contributor}; $resp = ''; # first author ... $author = shift @authors; if ($author) { ($name, undef) = munge_author( $author ); if ($name =~ /(.*?), (.*)/) { $name = "$2 $1"; } $resp = "$name"; } # followed by any additional authors ... while ($author = shift @authors) { ($name, undef) = munge_author( $author ); if ($name =~ /(.*?), (.*)/) { $name = "$2 $1"; } if (@authors) { $resp .= ", $name"; } else { $resp .= " and $name"; } } # followed by contributors ... foreach $author (@contribs) { ($name, undef, undef, undef, $role) = munge_author( $author ); next unless $role; if ($name =~ /(.*?), (.*)/) { $name = "$2 $1"; } while (defined $role) { if ($role =~ /edit/i) { $resp .= "; edited by $name"; last; } if ($role =~ /Trans/i) { $resp .= "; translated by $name"; last; } if ($role =~ /Illus/i) { $resp .= "; illustrated by $name"; last; } #$resp .= "; $name"; #if ($role) { $resp .= " ($role)"; } last; # only one pass thru required! } } $resp =~ s/\s+/ /g; return $resp; } #----------------------------------------------------------------------- sub today { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); return sprintf "%4d-%02d-%02d.%02d%02d", $year+1900, $mon+1, $mday, $hour, $min; } sub timestamp { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); return sprintf "%4d%02d%02d%02d%02d%02d.0", $year+1900, $mon+1, $mday, $hour, $min, $sec; } #----------------------------------------------------------------------- sub array2marc { my @trec = @_; # initialise stuff my $offset = 0; my $dir = ''; my $data = ''; # default pattern for leader my $ldrpat = "%05dnas 22%05duu 4500"; my ($line, $field, $tag, $fldlen, $base); # if a leader is included, build the pattern from that ... if ( $trec[0] =~ /^LDR/ ) { # leader codes $line = shift(@trec); # use the leader to create a pattern for building the leader later on # only the RS, MC, BLC, EL, DCC and Linked are used $ldrpat = '%05d'.substr($line,9,5).'22%05d'.substr($line,21,3).'4500'; } # process all the tags in sequence foreach $line ( @trec ) { # build the directory and data portions $tag = substr($line, 0, 3); $field = substr($line, 4); # get the data for the tag unless ($tag lt '010') { $field =~ tr/\|/\037/s; # change subfield delimiter(s) } $field =~ s/$/\036/; # append a field terminator $fldlen = length($field); $dir .= sprintf("%3s%04d%05d",$tag,$fldlen,$offset); $offset += $fldlen; $data .= $field; } # append a field terminator to the directory $dir =~ s/$/\036/; # append the record terminator $data =~ s/$/\035/; # compute lengths $base = length($dir) + 24; # base address of data my $lrl = $base + length($data); # logical record length # return the complete MARC record return (sprintf $ldrpat,$lrl,$base) # leader . $dir # directory . $data; # data } __END__ =head1 NAME pgrdf2marc.pl =head1 DESCRIPTION pgrdf2marc.pl converts one or more items from the Project Gutenberg RDF catalog into MARC format record(s). The RDF is read from STDIN, and the MARC output to STDOUT. Dublin Core tags used in the RDF are: dc:title dc:alternative dc:creator dc:contributor dc:tableOfContents dc:publisher dc:rights dc:language dc:created dc:type dcterms:LCSH dcterms:LCC A MARC record is simply an ASCII string of arbitrary length. =head2 MARC record structure Leader: start: 0 length: 24 Base Address (start of data): start: 12 length: 5 Directory: start: 24, length: (base - 24) Tag number: 3 bytes data length: 4 bytes data offset: 5 bytes Subfields begin with 0x1f Fields end with 0x1e Records end with 0x1d =head2 Array element structure The conversion process makes use of a simple array structure, where each array element contains the tag and data for a single MARC field, separated by a single space. cols. 0-2 : tag number col. 3 : blank cols. 4-5 : indicators cols. 6- : tag data e.g. 245 10|aSome title|h[GMD] The '|' character is used to represent MARC subfield separators (0x1f). =head1 REFERENCES MARC Standards, http://www.loc.gov/marc/ Dublin Core/MARC/GILS Crosswalk, http://www.loc.gov/marc/dccross.html =head1 VERSION Version 2011-01-15 =head1 AUTHOR Steve Thomas =head1 LICENCE Copyright (c) 2004-2011 Steve Thomas Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. =cut __END__ &pg; Geordie's Tryst A Tale of Scottish Life Rae, Mrs. Milne Geordie's Tryst by Mrs. Milne Rae en 2004-06-28 #!/usr/bin/perl -w use Fcntl; use GDBM_File; use NDBM_File; tie %newhash, 'GDBM_File', 'newdb', O_CREAT|O_RDWR, 0644; tie %oldhash, 'NDBM_File', "marcdb", 1, 0; %newhash = %oldhash; untie %newhash; untie %oldhash;