Example Store and Extract functions for setting up crawler jobs for Semantic Sitemaps -- a variation of the standard sitemap

Example of Extract Function


use WS;

-- Extract function for semantic-sitemap crawl jobs.
--
-- Parses a fetched sitemap document and queues every location it lists in
-- VFS_QUEUE with status 'waiting'.  Two document shapes are recognised:
--   * /urlset/dataset        -- each dataDumpLocation is queued; when a
--                               datasetURI is present it is saved as the
--                               'graph' entry of VFS_SITE.VS_UDATA for the
--                               row matching (_host, _root).
--   * /sitemapindex/sitemap  -- each loc is queued.
--
-- Parameters:
--   _host     matched against VFS_SITE.VS_HOST when recording the graph.
--   _url      only its suffix is examined: '.xml', '.xml.gz' and '.rdf'
--             documents are parsed, anything else is ignored.
--   _root     stored as VQ_ROOT on queued rows; matched against VS_ROOT.
--   _content  document body; for '.xml.gz' it is replaced IN PLACE by the
--             uncompressed text, so the caller sees the change.
--   _c_type   not used by this procedure.
--   lev       not used by this procedure.
--
-- Error handling is deliberately best-effort: any SQL error raised while
-- parsing or queueing makes the procedure return without committing.
create procedure WS.WS.SITEMAP_BB_PARSE (
  in _host varchar,
  in _url varchar,
  in _root varchar,
  inout _content varchar,
  in _c_type varchar := null,
  in lev int := 0)
{
  --pl_debug+
  declare xt, xp, graph any;

  -- Swallow every error (malformed XML, bad gzip data, ...) and give up on
  -- this document without committing.
  declare exit handler for sqlstate '*'
    {
      return;
    };

  if (_url like '%.xml.gz')
    {
      _content := gzip_uncompress (_content);
    }

  if (_url like '%.xml' or _url like '%.xml.gz' or _url like '%.rdf')
    {
      xt := xtree_doc (_content);

      -- Semantic sitemap: dataset description with RDF dump locations.
      if (xpath_eval ('/urlset/dataset', xt) is not null)
        {
          xp := xpath_eval ('/urlset/dataset/dataDumpLocation/text()', xt, 0);
          graph := cast (xpath_eval ('/urlset/dataset/datasetURI/text()', xt) as varchar);
          if (length (graph))
            update VFS_SITE set VS_UDATA = serialize (vector ('graph', graph)) where VS_HOST = _host and VS_ROOT = _root;
          foreach (any u in xp) do
            {
              declare hf any;
              declare host, url varchar;

              -- Text nodes often carry surrounding whitespace/newlines; strip
              -- it before parsing, exactly as the sitemap-index branch does.
              u := trim (cast (u as varchar));
              hf := WS.WS.PARSE_URI (u);
              host := hf[1];
              url := hf[2];
              -- Skip locations without a path, same as the sitemap-index branch.
              if (url <> '')
                {
                  insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                      values (host, now (), url, 'waiting', _root, NULL);
                  -- Nothing inserted: re-arm the existing queue entry instead.
                  if (row_count () = 0)
                    update VFS_QUEUE set VQ_STAT = 'waiting', VQ_TS = now () where VQ_HOST = host and VQ_ROOT = _root and VQ_URL = url;
                }
            }
        }

      -- Sitemap index: list of further sitemap documents.
      if (xpath_eval ('/sitemapindex/sitemap/loc', xt) is not null)
        {
          xp := xpath_eval ('/sitemapindex/sitemap/loc/text()', xt, 0);
          foreach (any u in xp) do
            {
              declare hf any;
              declare host, url varchar;

              u := trim (cast (u as varchar));
              hf := WS.WS.PARSE_URI (u);
              host := hf[1];
              url := hf[2];
              if (url <> '')
                {
                  insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                      values (host, now (), url, 'waiting', _root, NULL);
                  -- Nothing inserted: re-arm the existing queue entry instead.
                  if (row_count () = 0)
                    update VFS_QUEUE set VQ_STAT = 'waiting', VQ_TS = now () where VQ_HOST = host and VQ_ROOT = _root and VQ_URL = url;
                }
            }
        }
    }
  commit work;
}
;

Example of Store Function


use WS;

-- Store function for semantic-sitemap crawl jobs.
--
-- When udata is an array carrying a 'graph' entry and the URL ends in
-- '.rdf', the content is loaded as RDF/XML into that graph (the graph name
-- is passed as both the base and the graph argument) and
-- DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ () is called afterwards.
-- In every case the (_host, _url, _root) row of VFS_URL is inserted or
-- refreshed with the MD5 of the content, the current time and the ETag,
-- and the work is committed.
--
-- _c_type, store_flag and lev are accepted but not used here.
create procedure WS.WS.SITEMAP_BB_STORE (
  in _host varchar,
  in _url varchar,
  in _root varchar,
  inout _content varchar,
  in _s_etag varchar,
  in _c_type varchar,
  in store_flag int := 1,
  in udata any := null,
  in lev int := 0)
{
  --pl_debug+
  declare target_graph, digest varchar;

  -- Default to "no graph"; override only when udata can be searched.
  target_graph := null;
  if (isarray (udata))
    target_graph := get_keyword ('graph', udata);

  if (_url like '%.rdf' and target_graph is not null)
    {
      DB.DBA.RDF_LOAD_RDFXML (_content, target_graph, target_graph);
      DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ ();
    }

  -- One checksum serves both the insert and the fallback update.
  digest := md5 (_content);

  insert soft VFS_URL (VU_HOST, VU_URL, VU_CHKSUM, VU_CPTIME, VU_ETAG, VU_ROOT)
      values (_host, _url, digest, now (), _s_etag, _root);

  -- Nothing inserted: refresh the existing row instead.
  if (row_count () = 0)
    {
      update VFS_URL
         set VU_CHKSUM = digest,
             VU_CPTIME = now (),
             VU_ETAG = _s_etag
       where VU_HOST = _host
         and VU_URL = _url
         and VU_ROOT = _root;
    }

  commit work;
}
;

Related