javascript - Using Zotero translators to get author affiliation -
I'm working on a project where I need the author affiliations for articles published on DBLP. I've set up the Zotero translation server, which can be found on GitHub, and followed the other setup instructions as well.
Then I set up the connection in my Java program like this:
import java.io.bufferedreader; import java.io.inputstreamreader; import java.io.outputstreamwriter; import java.net.httpurlconnection; import java.net.url; import java.net.urlconnection; import org.json.*; public class zoterohandler { //function runing scan public static void scan(article article) throws exception { //setting url httpurlconnection given doi url urldoi = new url (article.getelectronicedition()); httpurlconnection conndoi = (httpurlconnection) urldoi.openconnection(); // make logic below easier detect redirections conndoi.setinstancefollowredirects(false); string doi = "{\"url\"smiley unsure"" + conndoi.getheaderfield("location") + "\",\"sessionid\"smiley unsure"abc123\"}"; //setting url translation-server url url = new url("http://127.0.0.1:1969/web"); urlconnection conn = url.openconnection(); conn.setdooutput(true); conn.setrequestproperty("content-type", "application/json"); outputstreamwriter writer = new outputstreamwriter(conn.getoutputstream()); writer.write(doi); writer.flush(); string line; bufferedreader reader = new bufferedreader(new inputstreamreader(conn.getinputstream())); while ((line = reader.readline()) != null ) { //used see of stream system.out.println(line); //incoming jsonarray, create new array, fill parse jsonarray jsonarr = new jsonarray(line); jsonobject obj = jsonarr.getjsonobject(0); //getting abstracts string abstracts = obj.getstring("abstractnote"); system.out.println(abstracts); //setting information in db article.setabstracts(abstracts); databasehandler.getinstance().updatearticle(article); } writer.close(); reader.close(); //need disconnect? //((httpurlconnection) conn).disconnect(); //conndoi.disconnect(); }
So far so good. I'm getting the information I want, storing it in the abstracts string, and saving it in our database. But I also need the author affiliation, so I need to somehow modify the translator script I'm using.
This is the script:
{ "translatorid": "5af42734-7cd5-4c69-97fc-bc406999bdba", "label": "atypon journals", "creator": "sebastian karcher", "target": "^https?://[^?#]+(?:/doi/((?:abs|abstract|full|figure|ref|citedby|book)/)?10\\.|/action/dosearch\\?)|^https?://[^/]+/toc/", "minversion": "3.0", "maxversion": "", "priority": 270, "inrepository": true, "translatortype": 4, "browsersupport": "gcsibv", "lastupdated": "2015-10-15 22:24:05" } /* atypon journals translator copyright (c) 2011-2014 sebastian karcher program free software: can redistribute and/or modify under terms of gnu general public license published free software foundation, either version 3 of license, or (at option) later version. program distributed in hope useful, without warranty; without implied warranty of merchantability or fitness particular purpose. see gnu general public license more details. should have received copy of gnu general public license along program. if not, see <http://www.gnu.org/licenses/>. */ function detectweb(doc, url) { if (url.search(/^https?:\/\/[^\/]+\/toc\/|\/action\/dosearch\?/) != -1) { return getsearchresults(doc, true) ? 
"multiple" : false; } var citlinks = zu.xpath(doc, '//a[contains(@href, "/action/showcitformats")]'); if (citlinks.length > 0) { if (url.indexof('/doi/book/') != -1) { return 'book'; } else if (url.search(/\.ch\d+$/)!=-1){ return 'booksection'; } return "journalarticle"; } } function getsearchresults(doc, checkonly, extras) { var articles = {}; var container = doc.getelementsbyname('frmsearchresults')[0] || doc.getelementsbyname('frmabs')[0]; if (!container) { z.debug('atypon: multiples container not found.'); return false; } var rows = container.getelementsbyclassname('articleentry'), found = false, doilink = 'a[contains(@href, "/doi/abs/") or contains(@href, "/doi/abstract/") or ' + 'contains(@href, "/doi/full/") or contains(@href, "/doi/book/")]'; (var = 0; i<rows.length; i++) { var title = rows[i].getelementsbyclassname('art_title')[0]; if (!title) continue; title = zu.triminternal(title.textcontent); var urlrow = rows[i]; var url = zu.xpathtext(urlrow, '(.//' + doilink + ')[1]/@href'); if (!url) { // e.g. http://pubs.rsna.org/toc/radiographics/toc/33/7 shows links in adjacent div urlrow = rows[i].nextelementsibling; if (!urlrow || urlrow.classlist.contains('articleentry')) continue; url = zu.xpathtext(urlrow, '(.//' + doilink + ')[1]/@href'); } if (!url) continue; if (checkonly) return true; found = true; if (extras) { extras[url] = { pdf: buildpdfurl(url, urlrow) }; } articles[url] = title; } if (!found){ z.debug("trying alternate multiple format"); var rows = container.getelementsbyclassname("item-details"); (var = 0; i<rows.length; i++) { var title = zu.xpathtext(rows[i], './h3'); if (!title) continue; title = zu.triminternal(title); var url = zu.xpathtext(rows[i], '(.//ul[contains(@class, "icon-list")]/li/' + doilink + ')[1]/@href'); if (!url) continue; if (checkonly) return true; found = true; if (extras) { extras[url] = { pdf: buildpdfurl(url, rows[i]) }; } articles[url] = title; } } return found ? 
articles : false; } // keep in line target regexp var replurlregexp = /\/doi\/((?:abs|abstract|full|figure|ref|citedby|book)\/)?/; function buildpdfurl(url, root) { if (!replurlregexp.test(url)) return false; // whole thing going fail anyway var pdfpaths = ['/doi/pdf/', '/doi/pdfplus/']; (var i=0; i<pdfpaths.length; i++) { if (zu.xpath(root, './/a[contains(@href, "' + pdfpaths[i] + '")]').length) { return url.replace(replurlregexp, pdfpaths[i]); } } z.debug('pdf link not found.') if (root.nodetype != 9 /*document_node*/) { z.debug('available links:'); var links = root.getelementsbytagname('a'); if (!links.length) z.debug('no links'); (var i=0; i<links.length; i++) { z.debug(links[i].href); } } return false; } function doweb(doc, url) { if (detectweb(doc, url) == "multiple") { var extras = {}; zotero.selectitems(getsearchresults(doc, false, extras), function (items) { if (!items) { return true; } var articles = []; (var itemurl in items) { articles.push({ url: itemurl.replace(/\?prev.+/, ""), extras: extras[itemurl] }); } fetcharticles(articles); }); } else { scrape(doc, url, {pdf: buildpdfurl(url, doc)}); } } function fixcase(str, titlecase) { if (str.touppercase() != str) return str; if (titlecase) { return zu.capitalizetitle(str, true); } return str.charat(0) + str.substr(1).tolowercase(); } function fetcharticles(articles) { if (!articles.length) return; var article = articles.shift(); zu.processdocuments(article.url, function(doc, url) { scrape(doc, url, article.extras); }, function() { if (articles.length) fetcharticles(articles); }); } function scrape(doc, url, extras) { url = url.replace(/[?#].*/, ""); var doi = url.match(/10\.[^?#]+/)[0]; var citationurl = url.replace(replurlregexp, "/action/showcitformats?doi="); var abstract = doc.getelementsbyclassname('abstractsection')[0]; //var authoraffiliation = doc.getelementsbyclassname('listgroup')[0]; var tags = zu.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, 
"keyword=")]'); z.debug("citation url: " + citationurl); zu.processdocuments(citationurl, function(citationdoc){ var filename = citationdoc.evaluate('//form//input[@name="downloadfilename"]', citationdoc, null, xpathresult.any_type, null).iteratenext().value; z.debug("filename: " + filename); var = '/action/downloadcitation'; var post = 'doi=' + doi + '&downloadfilename=' + filename + '&format=ris&direct=true&include=cit'; zu.dopost(get, post, function (text) { //z.debug(text); var translator = zotero.loadtranslator("import"); // calling ris translator translator.settranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7"); translator.setstring(text); translator.sethandler("itemdone", function (obj, item) { // titles , authros in caps item.title = fixcase(item.title); (var i=0; i<item.creators.length; i++) { item.creators[i].lastname = fixcase(item.creators[i].lastname, true); if (item.creators[i].firstname) { item.creators[i].firstname = fixcase(item.creators[i].firstname, true); } } item.url = url; //for emerald, rid of "null" add @ end of every title: if (url.indexof("www.emeraldinsight.com")!=-1){ item.title = item.title.replace(/null$/, "") } item.notes = []; (var in tags) { item.tags.push(tags[i].textcontent) } if (abstract) { // drop "abstract" prefix // not excellent, since abstracts // conceivably begin word "abstract" item.abstractnote = abstract.textcontent .replace(/^\s*abstract\s*/i, ''); } item.attachments = []; if (extras.pdf) { item.attachments.push({ url: extras.pdf, title: "full text pdf", mimetype: "application/pdf" }); } item.attachments.push({ document: doc, title: "snapshot", mimetype: "text/html" }); item.librarycatalog = url.replace(/^https?:\/\/(?:www\.)?/, '') .replace(/[\/:].*/, '') + " (atypon)"; item.complete(); }); translator.translate(); }); }) }
So is there anyone who can tell me how I need to update the script so that I can get the author affiliation? I know the script is supposed to go to the HTML class "listgroup" to find the author affiliation.
If you need more information, here are links to the available Zotero translators and to Zotero itself:
I solved the problem by doing this:
/**
 * Scrape one Atypon article page: download the RIS citation from the site,
 * import it with the RIS translator, then add the abstract, tags,
 * attachments and author affiliations taken from the page itself.
 *
 * Zotero has no dedicated affiliation field, so the affiliations are joined
 * with "; " and stored in the item's "extra" field.
 *
 * Relies on `replURLRegexp` and `fixCase` from the translator script.
 *
 * @param {Document} doc - the article page
 * @param {string} url - the article URL; must contain a DOI ("10.…"),
 *   otherwise the DOI lookup below throws
 * @param {Object} [extras] - `{ pdf }`, where pdf is a PDF URL or false
 */
function scrape(doc, url, extras) {
	url = url.replace(/[?#].*/, "");
	var doi = url.match(/10\.[^?#]+/)[0];
	var citationurl = url.replace(replURLRegexp, "/action/showCitFormats?doi=");

	// `affiliation` holds the page elements; `affiliations` collects their text.
	// NOTE(review): class names are case-sensitive and this listing was
	// lower-cased; confirm the exact class name against the publisher's page.
	var affiliations = [];
	var affiliation = doc.getElementsByClassName('listgroup');
	for (var a = 0; a < affiliation.length; a++) {
		// Collapse the markup's whitespace and skip empty elements
		var affiliationText = ZU.trimInternal(affiliation[a].textContent);
		if (affiliationText) affiliations.push(affiliationText);
	}

	var abstract = doc.getElementsByClassName('abstractSection')[0];
	var tags = ZU.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, "Keyword=")]');
	Z.debug("Citation URL: " + citationurl);
	ZU.processDocuments(citationurl, function(citationDoc){
		var filename = citationDoc.evaluate('//form//input[@name="downloadFileName"]', citationDoc, null, XPathResult.ANY_TYPE, null).iterateNext().value;
		Z.debug("Filename: " + filename);
		var get = '/action/downloadCitation';
		var post = 'doi=' + doi + '&downloadFileName=' + filename + '&format=ris&direct=true&include=cit';
		ZU.doPost(get, post, function (text) {
			//Z.debug(text);
			var translator = Zotero.loadTranslator("import");
			// Calling the RIS translator
			translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
			translator.setString(text);
			translator.setHandler("itemDone", function (obj, item) {
				// Titles and authors sometimes come in all caps
				item.title = fixCase(item.title);

				for (var i=0; i<item.creators.length; i++) {
					item.creators[i].lastName = fixCase(item.creators[i].lastName, true);
					if (item.creators[i].firstName) {
						item.creators[i].firstName = fixCase(item.creators[i].firstName, true);
					}
				}

				item.url = url;
				// For Emerald, get rid of the "null" that they add at the end of every title:
				if (url.indexOf("www.emeraldinsight.com") != -1) {
					item.title = item.title.replace(/null$/, "");
				}
				item.notes = [];
				// Indexed loop: for...in would also visit any extra enumerable
				// properties on the array
				for (var t = 0; t < tags.length; t++) {
					item.tags.push(tags[t].textContent);
				}

				if (abstract) {
					// Drop the "Abstract" prefix.
					// This is not excellent, since some abstracts could
					// conceivably begin with the word "abstract"
					item.abstractNote = abstract.textContent
						.replace(/^\s*abstract\s*/i, '');
				}

				item.attachments = [];
				if (extras && extras.pdf) {
					item.attachments.push({
						url: extras.pdf,
						title: "Full Text PDF",
						mimeType: "application/pdf"
					});
				}

				item.attachments.push({
					document: doc,
					title: "Snapshot",
					mimeType: "text/html"
				});
				// Catalog name is the host name without scheme, "www." and path
				item.libraryCatalog = url.replace(/^https?:\/\/(?:www\.)?/, '')
					.replace(/[\/:].*/, '') + " (Atypon)";

				// Affiliations
				item.extra = affiliations.join("; ");

				item.complete();
			});
			translator.translate();
		});
	});
}
I created an array called affiliations and a variable called affiliation. I fill the array with the strings I get and store them in the Zotero field called Extra, because Zotero doesn't have a dedicated field for author affiliation. It's a hack, but with it I can get the affiliation into my program.
Comments
Post a Comment