/* * rdf-smush.c - Smush a persistent Redland storage by owl:IFPs. * * Based on the work by mattb and libby. * * Copyright (C) 2003-2004 Morten Frederiksen - http://purl.org/net/morten/ * * and * * Copyright (C) 2000-2003 David Beckett - http://purl.org/net/dajobe/ * Institute for Learning and Research Technology - http://www.ilrt.org/ * University of Bristol - http://www.bristol.ac.uk/ * * This package is Free Software or Open Source available under the * following licenses (these are alternatives): * 1. GNU Lesser General Public License (LGPL) * 2. GNU General Public License (GPL) * 3. Mozilla Public License (MPL) * * See LICENSE.html or LICENSE.txt at the top of this package for the * full license terms. * */ #include #include #include #include #include #include const char *VERSION="0.10"; struct options { char *bulk; long int cache; librdf_node *context; char *database; char *directory; char *host; librdf_node *ifp; int list; char *model; int port; char *password; int quiet; int test; char *user; } opts; int main(int argc,char *argv[]); void stamp(char *argv0,char *t0); int smush(char *argv0,char *t0,librdf_world *world,librdf_model *model,librdf_node *ifp); int rewrite(librdf_world *world, librdf_model *model, librdf_node *canonical, librdf_node *node, librdf_statement *search); int getoptions(int argc,char *argv[],librdf_world *world); int usage(char *argv0,int version); int main(int argc,char *argv[]) { /* Redland objects. */ librdf_world *world; librdf_storage *storage; librdf_model *model; int argnum; char *storage_type; char *storage_options; char *t0; t0=malloc(strlen(argv[0])+31); stamp(argv[0],t0); /* Create rdflib world. */ if (!(world=librdf_new_world())) { stamp(argv[0],t0); fprintf(stderr, "%s: Failed to create Redland world\n",t0); return(1); } librdf_world_open(world); /* Parse command line options (if possible) and check for context. */ argnum=getoptions(argc,argv,world); /* Set storage options. */ if (opts.database) { storage_type=strdup("mysql"); storage_options=malloc(strlen(opts.host)+strlen(opts.database)+strlen(opts.user)+strlen(opts.password)+120); if (!storage_type || !storage_options) { stamp(argv[0],t0); fprintf(stderr, "%s: Failed to create 'mysql' storage options\n",t0); return(1); } sprintf(storage_options,"merge='yes',bulk='%s',nodecache='%li',host='%s',database='%s',port='%i',user='%s',password='%s',contexts='yes',write='yes'", opts.bulk,opts.cache,opts.host,opts.database,opts.port,opts.user,opts.password); } else { storage_type=strdup("hashes"); storage_options=malloc(strlen(opts.directory)+120); if (!storage_type || !storage_options) { stamp(argv[0],t0); fprintf(stderr, "%s: Failed to create 'hashes' storage options\n",t0); return(1); } sprintf(storage_options,"hash-type='bdb',dir='%s',contexts='yes',write='yes'", opts.directory); } /* Create storage. */ if (!(storage=librdf_new_storage(world,storage_type,opts.model,storage_options))) { stamp(argv[0],t0); fprintf(stderr, "%s: Failed to create storage (%s/%s/%s)\n",t0,storage_type,opts.model,storage_options); return(1); } /* Create model. */ if (!(model=librdf_new_model(world,storage,NULL))) { stamp(argv[0],t0); fprintf(stderr, "%s: Failed to create model\n",t0); return(1); } if (librdf_model_size(model)!=-1 && !opts.quiet) { stamp(argv[0],t0); fprintf(stdout, "%s: * Model '%s' contains %d statements.\n",t0,opts.model,librdf_model_size(model)); fflush(stdout); } /* Smush - a single context or all, internal IFPs or given? */ if (opts.ifp) { smush(argv[0],t0,world,model,opts.ifp); } else { librdf_stream *ifpstream; librdf_statement *ifpstatement; /* Find all owl:IFPs. */ if (!(ifpstatement=librdf_new_statement_from_nodes(world,NULL, librdf_new_node_from_uri_string(world,"http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), librdf_new_node_from_uri_string(world,"http://www.w3.org/2002/07/owl#InverseFunctionalProperty")))) { stamp(argv[0],t0); fprintf(stderr, "%s: Unable to query for IFPs in model '%s'\n",t0,opts.model); return(1); } if (!(ifpstream=librdf_model_find_statements(model,ifpstatement))) { stamp(argv[0],t0); fprintf(stderr, "%s: Failed to find any IFPs in model '%s'\n",t0,opts.model); return(1); } while (!librdf_stream_end(ifpstream)) { librdf_node *ifp=librdf_new_node_from_node(librdf_statement_get_subject(librdf_stream_get_object(ifpstream))); smush(argv[0],t0,world,model,ifp); librdf_free_node(ifp); librdf_stream_next(ifpstream); } librdf_free_stream(ifpstream); librdf_free_statement(ifpstatement); } librdf_model_sync(model); if (!opts.quiet) { stamp(argv[0],t0); fprintf(stdout, "%s: * Done.\n",t0); fflush(stdout); } /* Clean up. */ if (opts.context) librdf_free_node(opts.context); if (opts.ifp) librdf_free_node(opts.ifp); librdf_free_model(model); librdf_free_storage(storage); librdf_free_world(world); free(storage_options); free(storage_type); free(t0); /* keep gcc -Wall happy */ return(0); } void stamp(char *argv0,char *t0) { time_t now=time(NULL); strcpy(t0,argv0); strcat(t0," ["); strftime(&t0[strlen(t0)],strlen(argv0)+31-strlen(t0),"%FT%TZ",gmtime(&now)); strcat(t0,"]"); } librdf_statement *ifp_filter(librdf_stream *stream, void *map_context, librdf_statement *statement) { librdf_node *ifp=(librdf_node*)map_context; if (librdf_node_equals(librdf_statement_get_predicate(statement),ifp)) return statement; return NULL; } int smush(char *argv0,char *t0,librdf_world *world,librdf_model *model,librdf_node *ifp) { librdf_storage *tempstorage; librdf_model *tempmodel; librdf_stream *ostream, *sstream, *rstream; librdf_statement *osearch=NULL, *ssearch, *rewrites; librdf_node *object, *prevobject=NULL, *canonical=NULL, *subject, *prevsubject=NULL; int rc; if (!opts.quiet || opts.list) { stamp(argv0,t0); char *p=librdf_node_to_string(ifp); fprintf(stdout, "%s: * Smushing on %s...\n",t0,p); fflush(stdout); free(p); if(opts.list) return 0; } /* Create in-memory storage/model to hold rewrites, to avoid deadlocks. */ if (!(tempstorage=librdf_new_storage(world,"memory","dummy",""))) { stamp(argv0,t0); fprintf(stderr, "%s: Failed to create temporary storage\n",t0); return 1; } if (!(tempmodel=librdf_new_model(world,tempstorage,NULL))) { stamp(argv0,t0); fprintf(stderr, "%s: Failed to create temporary model\n",t0); return 2; } /* Find objects of statements involving IFP. */ if (opts.context) { ostream=librdf_model_context_as_stream(model,opts.context); if (ostream) librdf_stream_add_map(ostream,ifp_filter,NULL,ifp); } else { if (!(osearch=librdf_new_statement_from_nodes(world,NULL, librdf_new_node_from_node(ifp),NULL))) return 3; ostream=librdf_model_find_statements(model,osearch); } /* Loop through all objects of IFP (possibly only in given context) */ while (ostream && !librdf_stream_end(ostream)) { canonical=NULL; if (!(object=librdf_new_node_from_node(librdf_statement_get_object(librdf_stream_get_object(ostream))))) return 4; /* Skip empty literal values, a source of error, and duplicates... */ if ((!prevobject || !librdf_node_equals(object,prevobject)) && (librdf_node_get_type(object)!=LIBRDF_NODE_TYPE_LITERAL || strlen(librdf_node_get_literal_value(object)))) { if (prevobject) librdf_free_node(prevobject); if (!(prevobject=librdf_new_node_from_node(object))) return 5; /* Find all subjects with given value for IFP. */ if (!(ssearch=librdf_new_statement_from_nodes(world,NULL, librdf_new_node_from_node(ifp), librdf_new_node_from_node(object)))) return 6; sstream=librdf_model_find_statements(model,ssearch); /* Check not really necessary, but still... */ if (!sstream || librdf_stream_end(sstream)) { librdf_free_statement(ssearch); if (sstream) librdf_free_stream(sstream); librdf_stream_next(ostream); continue; } /* Save first rewritable candidate subject as canonical. */ while (!librdf_stream_end(sstream)) { if (!canonical) { canonical=librdf_new_node_from_node(librdf_statement_get_subject(librdf_stream_get_object(sstream))); } else { librdf_node *candidate=librdf_new_node_from_node(librdf_statement_get_subject(librdf_stream_get_object(sstream))); if ((librdf_node_get_type(candidate)==LIBRDF_NODE_TYPE_RESOURCE && librdf_node_get_type(canonical)!=LIBRDF_NODE_TYPE_RESOURCE) || (librdf_node_get_type(candidate)==LIBRDF_NODE_TYPE_RESOURCE && librdf_node_get_type(canonical)==LIBRDF_NODE_TYPE_RESOURCE && strlen(librdf_node_to_string(candidate)) < strlen(librdf_node_to_string(canonical)))) { canonical=librdf_new_node_from_node(librdf_statement_get_subject(librdf_stream_get_object(sstream))); } librdf_free_node(candidate); } librdf_stream_next(sstream); } librdf_free_stream(sstream); sstream=librdf_model_find_statements(model,ssearch); if (!opts.quiet) { stamp(argv0,t0); char *o=librdf_node_to_string(object); char *c=librdf_node_to_string(canonical); // fprintf(stdout, "%s: %s %s\n",t0,o,c); fprintf(stdout, "."); fflush(stdout); free(o); free(c); } /* Rewrite other nodes to canonical. */ while (!librdf_stream_end(sstream)) { subject=librdf_new_node_from_node(librdf_statement_get_subject(librdf_stream_get_object(sstream))); /* No need to rewrite canonical and duplicates... */ if (librdf_node_equals(canonical,subject) || (prevsubject && librdf_node_equals(subject,prevsubject))) { librdf_free_node(subject); librdf_stream_next(sstream); continue; } if (prevsubject) librdf_free_node(prevsubject); if (!(prevsubject=librdf_new_node_from_node(subject))) return 7; if (!opts.quiet) { stamp(argv0,t0); char *s=librdf_node_to_string(subject); // fprintf(stdout, "%s: %s\n",t0,s); // fflush(stdout); free(s); } /* Add relation to temporary model. */ if (!(rewrites=librdf_new_statement_from_nodes(world, librdf_new_node_from_node(subject), librdf_new_node(world), librdf_new_node_from_node(canonical)))) return 8; if (librdf_model_add_statement(tempmodel,rewrites)) return 9; librdf_free_statement(rewrites); librdf_free_node(subject); librdf_stream_next(sstream); } librdf_free_node(canonical); librdf_free_stream(sstream); librdf_free_statement(ssearch); if (prevsubject) librdf_free_node(prevsubject); prevsubject=NULL; } librdf_free_node(object); librdf_stream_next(ostream); } if (prevobject) librdf_free_node(prevobject); if (ostream) librdf_free_stream(ostream); if (osearch) librdf_free_statement(osearch); if(librdf_model_size(tempmodel)) { if (!opts.quiet) { stamp(argv0,t0); char *p=librdf_node_to_string(ifp); fprintf(stdout, "%s: * Performing %d rewrites for %s...\n",t0,librdf_model_size(tempmodel),p); fflush(stdout); free(p); } /* Find and perform queued rewrites. */ rstream=librdf_model_as_stream(tempmodel); while (rstream && !librdf_stream_end(rstream)) { subject=librdf_statement_get_subject(librdf_stream_get_object(rstream)); canonical=librdf_statement_get_object(librdf_stream_get_object(rstream)); if (opts.test || !opts.quiet) { char *s=librdf_node_to_string(subject); char *c=librdf_node_to_string(canonical); fprintf(stdout, "%s -> %s\n",s,c); fflush(stdout); free(s); free(c); if (opts.test) { librdf_stream_next(rstream); continue; } } /* Rewrite subjects... */ if (!(rewrites=librdf_new_statement_from_nodes(world, librdf_new_node_from_node(subject),NULL,NULL))) return 16; rc=rewrite(world,model,canonical,subject,rewrites); if (rc) return rc+16; librdf_free_statement(rewrites); /* Rewrite objects... */ if (!(rewrites=librdf_new_statement_from_nodes(world,NULL,NULL, librdf_new_node_from_node(subject)))) return 32; rc=rewrite(world,model,canonical,subject,rewrites); if (rc) return rc+32; librdf_free_statement(rewrites); /* Rewrite predicates? */ if (librdf_node_get_type(subject)==LIBRDF_NODE_TYPE_RESOURCE) { if (!(rewrites=librdf_new_statement_from_nodes(world,NULL, librdf_new_node_from_node(subject),NULL))) return 64; rc=rewrite(world,model,canonical,subject,rewrites); if (rc) return rc+64; librdf_free_statement(rewrites); } /* Add owl:SameAs if URIs */ if(librdf_node_get_type(subject)==LIBRDF_NODE_TYPE_RESOURCE && librdf_node_get_type(canonical)==LIBRDF_NODE_TYPE_RESOURCE) { librdf_node *sameuri = librdf_new_node_from_uri_string(world, "http://www.w3.org/2002/07/owl#sameAs"); librdf_statement *sameas = librdf_new_statement_from_nodes(world, canonical, sameuri, subject); librdf_model_add_statement(model, sameas); fprintf(stdout, "Adding owl:sameAs statement: %s owl:sameAs %s\n", librdf_node_to_string(canonical), librdf_node_to_string(subject)); librdf_free_statement(sameas); librdf_free_node(sameuri); } librdf_stream_next(rstream); } if (rstream) librdf_free_stream(rstream); } librdf_free_model(tempmodel); librdf_free_storage(tempstorage); return 0; }; int rewrite(librdf_world *world, librdf_model *model, librdf_node *canonical, librdf_node *node, librdf_statement *search) { librdf_stream *stream; librdf_statement *oldstatement, *newstatement; librdf_node *context, *object, *predicate, *subject; /* Rewrite rewrite to canonical in statements matching search. */ stream=librdf_model_find_statements(model,search); if (!stream) return 0; while (!librdf_stream_end(stream)) { oldstatement=librdf_stream_get_object(stream); context=librdf_stream_get_context(stream); if(context) context=librdf_new_node_from_node(context); /* Remove old statement involving node as subject. */ if(context) { if (librdf_model_context_remove_statement(model,context,oldstatement)) return 1; } else { if (librdf_model_remove_statement(model,oldstatement)) return 1; } /* Rewrite object? */ object=librdf_new_node_from_node(librdf_statement_get_object(oldstatement)); if (librdf_node_equals(object,node)) { librdf_free_node(object); object=librdf_new_node_from_node(canonical); } /* Rewrite predicate? */ predicate=librdf_new_node_from_node(librdf_statement_get_predicate(oldstatement)); if (librdf_node_equals(predicate,node)) { librdf_free_node(predicate); predicate=librdf_new_node_from_node(canonical); } /* Rewrite subject? */ subject=librdf_new_node_from_node(librdf_statement_get_subject(oldstatement)); if (librdf_node_equals(subject,node)) { librdf_free_node(subject); subject=librdf_new_node_from_node(canonical); } /* Insert new statement. */ if (!(newstatement=librdf_new_statement_from_nodes(world, subject,predicate,object))) return 2; if(context) { if (librdf_model_context_add_statement(model,context,newstatement)) return 3; } else { if (librdf_model_add_statement(model,newstatement)) return 3; } librdf_free_statement(newstatement); if(context) librdf_free_node(context); librdf_stream_next(stream); } librdf_free_stream(stream); return 0; } int getoptions(int argc,char *argv[],librdf_world *world) { /* Define command line options. */ struct option opts_long[]={ {"help",no_argument,NULL,'?'}, {"bulk",no_argument,NULL,'B'}, {"cache",required_argument,NULL,'C'}, {"database",required_argument,NULL,'s'}, {"directory",required_argument,NULL,'d'}, {"host",required_argument,NULL,'h'}, {"ifp",required_argument,NULL,'i'}, {"list",no_argument,NULL,'l'}, {"model",required_argument,NULL,'m'}, {"port",required_argument,NULL,'P'}, {"password",optional_argument,NULL,'p'}, {"quiet",no_argument,NULL,'q'}, {"test",no_argument,NULL,'t'}, {"user",required_argument,NULL,'u'}, {"version",no_argument,NULL,'v'}, {0,0,0,0}}; const char *opts_short="?BC:c:D:d:h:i:lm:P:p:qtu:v"; int i=1; char c; char *buffer; int ttypasswd=1; /* Set defaults. */ opts.cache=0; opts.context=NULL; opts.ifp=NULL; opts.list=0; opts.password=0; opts.port=3306; opts.quiet=0; opts.test=0; opts.user=0; if (!(opts.directory=strdup("./")) || !(opts.bulk=strdup("no")) || !(opts.host=strdup("mysql")) || !(opts.model=strdup("redland")) || !(opts.database=strdup("redland"))) { fprintf(stderr,"%s: Failed to allocate default options\n",argv[0]); exit(1); }; while ((c=getopt_long(argc,argv,opts_short,opts_long,&i))!=-1) { if (optarg) { buffer=malloc(strlen(optarg)+1); if (!buffer) { fprintf(stderr,"%s: Failed to allocate buffer for command line argument (%s)\n",argv[0],optarg); exit(1); }; strncpy(buffer,optarg,strlen(optarg)+1); }; switch (c) { case '?': usage(argv[0],0); case 'B': free(opts.bulk); opts.bulk=(char*)malloc(4); if (!opts.bulk) { fprintf(stderr,"%s: Failed to allocate buffer for bulk parameter\n",argv[0]); exit(1); }; strncpy(opts.bulk,"yes",4); break; case 'C': free(buffer); opts.cache=atol(optarg); break; case 'c': if (!(opts.context=librdf_new_node_from_uri_string(world,buffer))) { fprintf(stderr, "%s: Failed to create context node\n",argv[0]); usage(argv[0],0); } free(buffer); break; case 'D': free(opts.directory); opts.directory=0; opts.database=buffer; break; case 'd': free(opts.database); opts.database=0; opts.directory=buffer; break; case 'h': opts.host=buffer; break; case 'i': if (!(opts.ifp=librdf_new_node_from_uri_string(world,buffer))) { fprintf(stderr, "%s: Failed to create IFP node\n",argv[0]); usage(argv[0],0); } free(buffer); break; case 'l': opts.list=1; break; case 'm': opts.model=buffer; break; case 'P': free(buffer); opts.port=atoi(optarg); break; case 'p': opts.password=buffer; ttypasswd=0; break; case 'q': opts.quiet=1; break; case 't': opts.test=1; break; case 'u': opts.user=buffer; break; case 'v': usage(argv[0],1); default: fprintf(stderr,"%s: Invalid option (%c)\n",argv[0],c); usage(argv[0],0); } } /* Flag missing user name. */ if (opts.database && !opts.user) { fprintf(stderr,"%s: Missing user name for mysql storage\n",argv[0]); usage(argv[0],0); exit(1); }; /* Read password from tty if not specified. */ if (opts.database && ttypasswd) { char c2; int i2=0; opts.password=malloc(128); if (!opts.password) { fprintf(stderr,"%s: Failed to allocate buffer for password\n",argv[0]); exit(1); }; fprintf(stderr,"%s: Enter password for %s@%s/%s: ",argv[0],opts.user,opts.host,opts.database); while ((c2=getchar())!='\n') { opts.password[i2++]=c2; if (i2==127) break; }; opts.password[i2++]=0; }; return optind; } int usage(char *argv0,int version) { printf("\n\ %s Version %s\n\ Smush a persistent Redland storage by owl:IFPs.\n\ * Copyright (C) 2003-2004 Morten Frederiksen - http://purl.org/net/morten/\n\ * Copyright (C) 2000-2003 David Beckett - http://purl.org/net/dajobe/\n\ ",argv0,VERSION); if (version) exit(0); printf("\n\ usage: %s [options]\n\ \n\ -?, --help Display this help message and exit.\n\ -B, --bulk Optimize for bulk loads by locking tables and disabling\n\ indices while loading. Note that this excludes all other\n\ access to the storage.\n\ -C, --cache=\n\ Length of node cache list, to reduce extranous inserts.\n\ Default node cache list length is 64.\n\ -c, --context=\n\ URI of context to smush.\n\ -D, --database=\n\ Name of MySQL database to use, default is 'redland'.\n\ -d, --directory=\n\ Directory to use for BDB files. When provided implies use\n\ of 'hashes' storage type instead of 'mysql'.\n\ -h, --host=\n\ Host to contact for MySQL connections, default is 'mysql'.\n\ -i, --ifp=\n\ URI of owl:InverseFunctionalProperty to smush on. If not\n\ specified, all IFP's in model will be used.\n\ -l, --list\n\ Only list IFP's that would be used for smushing.\n\ -m, --id=\n\ Identifier for (name of) storage (model name for storage\n\ type 'mysql', base file name for storage type 'hashes'),\n\ default is 'redland'.\n\ -p, --password=\n\ Password to use when connecting to MySQL server.\n\ If password is not given it's asked from the tty.\n\ -P, --port=\n\ The port number to use when connecting to MySQL server.\n\ Default port number is 3306.\n\ -q, --quiet\n\ No informational messages, only errors.\n\ -t, --test\n\ Test only, don't perform rewrites, just list them as\n\ doubles (original and canonical).\n\ -u, --user=\n\ User name for MySQL server.\n\ -v, --version Output version information and exit.\n\ \n\ ",argv0); exit(1); }