#!/usr/bin/scl enable rh-python35 -- python3
"""Zotero toolkit CGI script providing tools to support meta-analysis using a
library managed in Zotero.

Copyright 2019-2021, Eric Thrift

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

# Zotero library queried by this script.
LIBRARY_ID = '2183860'
LIBRARY_TYPE = 'group'
# On-disk locations of the query cache and of the generated graph images,
# plus the public URL under which the image directory is served.
CACHE_DIR = '/home3/ethrift/zotero_query_cache'
IMG_DIR = '/home3/ethrift/public_html/static/zotero_query_graphs'
IMG_URL = 'https://driedfishmatters.org/static/zotero_query_graphs'

import json
import sys
import cgi
import codecs
import base64
import hashlib
import glob
import os
import textwrap
import urllib.parse

from pyzotero import zotero
from cachier import cachier

import cgitb
cgitb.enable()

# Must be set BEFORE importing pandas (--> numpy) to avoid a segfault.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
import pandas as pd
import matplotlib.pyplot as plt

try:
    plt.style.use('seaborn')
except OSError:
    # Newer matplotlib releases ship this style as "seaborn-v0_8" only.
    plt.style.use('seaborn-v0_8')
plt.rcParams["figure.figsize"] = [8, 5]

# Emit UTF-8 regardless of the server locale, and send error output to the
# response body instead of the web server's error log.
sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
sys.stderr = sys.stdout

# True once a Content-Type header has been written for this response.
_header_sent = False

# NOTE(review): the templates below contain no HTML markup and no format
# placeholders in this copy of the file; they look like they were stripped
# by a text conversion -- confirm against the deployed script.
HTML_HEADER = """
"""

QUERY_BUTTON = """

Revise query
Reset the form with current tag list(s) pre-loaded

"""

HTML_FOOTER = """
"""

FORM = """

Zotero tags query

This script will query the Zotero library to produce tables or graphs identifying the number of items associated with specific tags.
Provide a list of tags to include in the results, one per line. Provide a list of tags to include in the results, one per line. If both x and y axes are specified, the query results will include the number of items matching the intersection of each (x,y) pair. Provide a list of tags to to filter the results, one per line. These tags will not be included explicitly in the resulting dataset, but only items that match ALL the tags in this list will be included in the results. Use a hyphen prefix for tags to be excluded, e.g., -ignore to prevent items with the tag ignore from being listed in the results.

Graph options

SVG images (default) can be resized without loss of quality, but may not display properly in some applications. Use horizontal bar graphs if the labels are longer. Represent values in (x,y) unions as raw numbers (default). Represent values in (x,y) unions as a percentage of the total number of items in the library matching tag x. The total is adjusted to include only items matching the filter criteria given above. Represent values in (x,y) unions as a percentage of the total number of items in current result set. This will result in stacked bar graphs that add up to 100%. Sort data according to the y axis. (EXPERIMENTAL: Generally leave checked.) If both x and y are arrays, generate a series of graphs, one for each value in x. Print source values on each bar in a bar graph. Currently only works for horizontal bar graphs. Transpose x and y axes in the resulting table or graph. Present graph results in stacked format. Create a square graph.
"""


def send_header(content_type='text/html'):
    """Write the HTTP Content-Type header, at most once per response."""
    global _header_sent
    if _header_sent:
        return
    sys.stdout.write('Content-Type: {}\r\n\r\n'.format(content_type))
    _header_sent = True


def _strip(tag):
    """Return ``tag`` with leading/trailing marker characters removed.

    A tag made up only of marker characters (e.g. "!" or "**") is returned
    unchanged so that it never collapses to an empty label.
    """
    new_tag = tag.strip('!@#$%^&*_+')
    return new_tag or tag


def _tag_set(tags, tag_filter):
    """Combine query tags and optional filter tags into a hashable set."""
    return frozenset(list(tags) + list(tag_filter or []))


def editable_query(p):
    """Return a query dict with format=none to return the form."""
    q = {'format': None}
    for field in ('filter', 'tags_x', 'tags_y'):
        value = p[field]
        # Non-list values (None or '') become empty strings so the form
        # inputs don't say "None".
        q[field] = '\r\n'.join(value) if isinstance(value, list) else ''
    return q


def parse_query():
    """Parse the incoming cgi query, returning a dict.

    Every known field is present in the result. Single-value fields hold the
    submitted value or None. The list fields ('filter', 'tags_x', 'tags_y')
    hold a list with one tag per non-blank input line, or None when no tag
    was supplied.
    """
    q = cgi.FieldStorage(keep_blank_values=True)
    # fields that take a single value
    simplefields = [
        'edit_query', 'format', 'values_type', 'graph_format', 'stack',
        'purge', 'purge_data', 'purge_images', 'image_type', 'transpose',
        'sort', 'subplots', 'square', 'label_bars', 'label_int']
    # fields that should be split on newlines
    listfields = ['filter', 'tags_x', 'tags_y']

    params = {}
    for k in simplefields:
        params[k] = q.getvalue(k, None)
    for k in listfields:
        v = q.getvalue(k, None)
        if isinstance(v, list):
            # the field was submitted more than once; treat as extra lines
            v = '\n'.join(v)
        if not v:
            params[k] = None
            continue
        # Accept both CRLF (browser forms) and bare LF (hand-written URLs),
        # and ignore blank lines so they are not queried as empty tags.
        lines = v.replace('\r\n', '\n').split('\n')
        params[k] = [line for line in lines if line.strip()] or None
    return params


def get_count(tags_x, tag_filter, rows=True):
    """Generate a data table containing counts of items per tag.

    ``tags_x`` is a list of tags and ``tag_filter`` an optional list of tags
    that every counted item must also match. With ``rows`` true the result is
    a list of ``{'tag': ..., 'count': ...}`` records; otherwise it is a dict
    mapping each stripped tag to its count.
    """
    counts = [(_strip(x), len(query_zotero(_tag_set([x], tag_filter))))
              for x in tags_x]
    if rows:
        return [{'tag': tag, 'count': count} for tag, count in counts]
    return dict(counts)


@cachier(cache_dir=CACHE_DIR, pickle_reload=False)
def query_zotero(tags):
    """Query Zotero.

    The tags parameter is an immutable set, so the function can be hashed and
    stored in a cache. The cache does not expire, so it needs to be invalidated
    using the purge() method.
    """
    zot = zotero.Zotero(LIBRARY_ID, LIBRARY_TYPE)
    try:
        t = zot.everything(zot.items(tag=tags, format='versions', limit=None))
    except Exception:
        error('Zotero server error')
    return t


def get_union(tags_x, tags_y, tag_filter):
    """Generate a data table containing an array of tag correlations.

    Each of ``tags_x`` and ``tags_y`` is a list of tags. Use the
    ``tag_filter`` argument as a global filter to limit the results to items
    that match a specific tag or tags (ALL tags in the list must be matched
    for an item to be included in the result set). To exclude items that
    match a given tag, use a negative operator prefix (e.g.,
    "-tag to exclude").

    Returns one record per y tag, holding the stripped y tag under 'tag' and
    the number of matching items under each stripped x tag.
    """
    rows = []
    for y in tags_y:
        row = {'tag': _strip(y)}
        for x in tags_x:
            row[_strip(x)] = len(query_zotero(_tag_set([x, y], tag_filter)))
        rows.append(row)
    return rows


def percentify(p, matches=False):
    """Construct a data table in which values are percentages of total.

    By default the "total" is the overall number of items in the Zotero
    library matching each x tag (and the filter). With ``matches`` true the
    total is the sum of the values in the current row, so each row adds up to
    100. A total of zero yields 0 rather than a division error.
    """
    raw = get_union(p['tags_x'], p['tags_y'], p['filter'])
    if matches:
        totals = {
            row['tag']: sum(v for k, v in row.items() if k != 'tag')
            for row in raw}
    else:
        totals = get_count(p['tags_x'], p['filter'], rows=False)

    percent = []
    for row in raw:
        newrow = {'tag': row['tag']}
        for tag, value in row.items():
            if tag == 'tag':
                continue
            total = totals[row['tag']] if matches else totals[tag]
            if total == 0:
                newrow[tag] = 0
            else:
                # true division: keep fractions so rows can add up to 100
                newrow[tag] = int(value) / int(total) * 100
        percent.append(newrow)
    return percent


def get_data(p):
    """Execute Zotero queries and return a raw data table."""
    have_both = p['tags_x'] and p['tags_y']
    if have_both and p['values_type'] == 'percent':
        return percentify(p)
    if have_both and p['values_type'] == 'percent_matches':
        return percentify(p, matches=True)
    if have_both:
        return get_union(p['tags_x'], p['tags_y'], p['filter'])
    if p['tags_x']:
        return get_count(p['tags_x'], p['filter'])
    return error("no data")


def error(message):
    """Exit with an error message.

    A text/plain header is sent first if the response has no header yet, so
    the message is still a well-formed CGI response.
    """
    send_header('text/plain')
    sys.exit(message)


def hash_query(p):
    """Create a unique hash ID for the current query."""
    enc = json.dumps(p, sort_keys=True).encode()
    return hashlib.sha1(enc).hexdigest()


def graph(p):
    """Return an html img tag for a graph.

    The image is stored in IMG_DIR under a name derived from the query hash
    and is only built when no file with that name exists yet.
    """
    h = hash_query(p)
    fmt = p['image_type'] or 'svg'
    # The image type comes from the request and becomes part of a file name:
    # refuse anything that could contain a path separator or a dot.
    if not (isinstance(fmt, str) and fmt.isalnum()):
        error('invalid image type')
    img = '{}.{}'.format(h, fmt)
    path = os.path.join(IMG_DIR, img)
    url = '/'.join([IMG_URL, img])
    if not os.path.exists(path):
        build_graph(p, path)
    # NOTE(review): the markup in the two templates below appears to have
    # been stripped from this copy of the file -- confirm before deploying.
    if fmt == 'pdf':
        return '\nPDF: {}\n'.format(url, h)
    return '\n'.format(url, h)


def build_graph(p, path):
    """Create a graph image and store on disk."""
    df = dataframe(p)
    stacked = bool(p['stack'])
    graph_format = p['graph_format'] or 'barh'

    plot_args = {
        'kind': graph_format,
        # this is the width of bars. At 100% they touch one another.
        'width': 0.6 if stacked else 0.85,
        'stacked': stacked,
        'rot': 0,
    }
    if p['square']:
        plot_args['figsize'] = (4, 4)
    u = df.plot(**plot_args)

    plt.xlabel('')
    plt.ylabel('')

    if p['label_bars'] and graph_format == 'barh':
        # whole numbers when label_int is set, one decimal otherwise
        num = "{:.0f}" if p['label_int'] else "{:.1f}"
        for patch in u.patches:
            # place the label just right of the bar end, vertically centred
            u.annotate(
                num.format(patch.get_width()),
                (patch.get_x() + patch.get_width(),
                 patch.get_y() + patch.get_height() / 2),
                xytext=(2, 0), textcoords='offset points',
                fontsize=7, va='center')

    # multiple columns with long labels cause the graph itself to get narrower
    # FIXME: Use a threshold for maximum label length. Set column number
    # so that the total width per row is <= 60 chars
    u.legend(frameon=False, loc='upper center',
             bbox_to_anchor=(0.5, -0.05), ncol=2)
    plt.tight_layout()
    u.figure.savefig(path, bbox_inches="tight")
    # release the figure; subplot mode builds several graphs per request
    plt.close(u.figure)
    return True


def _remove_images():
    """Delete every file in the graph image directory."""
    for image in glob.glob(IMG_DIR + '/*'):
        os.remove(image)


def purge_data():
    """Clear the query cache."""
    send_header('text/plain')
    query_zotero.clear_cache()
    print('Query cache purged')
    return


def purge_images():
    """Delete stored graph images."""
    send_header('text/plain')
    _remove_images()
    print('Image cache purged')
    return


def purge():
    """Clear the query cache and delete stored graph images."""
    send_header('text/plain')
    query_zotero.clear_cache()
    _remove_images()
    print('Query and image cache purged')
    return


def wrap(data):
    """Wrap data labels for 'tag' index field.

    The records are modified in place (labels are wrapped at 16 characters)
    and the same list is returned.
    """
    for d in data:
        d['tag'] = '\n'.join(textwrap.wrap(d['tag'], width=16))
    return data


def dataframe(p):
    """Generate a pandas dataframe indexed by the (wrapped) 'tag' label."""
    data = get_data(p)
    wrap(data)
    df = pd.DataFrame.from_records(data, index='tag')
    if p['sort']:
        df = df.sort_values(by=df.columns.values.tolist())
    elif p['tags_y']:
        # This reorders the columns by manually supplied order. Only (x,y)
        # tables have one column per x tag; a plain count table has a single
        # 'count' column and its rows already follow the supplied order.
        # FIXME: Move the strip function so we don't do it twice
        df = df.reindex(columns=[_strip(x) for x in p['tags_x']])
    if p['transpose']:
        df = df.T
    return df


def table(p):
    """Return the query result rendered as an html table."""
    df = dataframe(p)
    html = df.to_html(classes='collapse ba br2 b--black-10 pv2 ph3 mt4 center')
    # undo the label wrapping: a literal backslash-n in the markup becomes
    # a space
    return html.replace('\\n', ' ')


def print_json(p):
    """Print a json representation of the raw query response data."""
    # Run the query first so a failure is reported as a plain-text error
    # instead of being appended to a response announced as JSON.
    data = get_data(p)
    send_header('application/json')
    print(json.dumps(data, indent=4, sort_keys=True))
    return


def print_html(content, new_query=False):
    """Print content as an html document with matching HTTP header."""
    send_header('text/html')
    print(HTML_HEADER)
    print(content)
    if new_query:
        print(QUERY_BUTTON.format(new_query))
    print(HTML_FOOTER)
    return


def run():
    """Dispatch the request: purge caches, or print JSON, a table, graphs
    or the query form."""
    p = parse_query()
    q = editable_query(p)
    e = urllib.parse.urlencode(q)

    if p['purge']:
        return purge()
    if p['purge_data']:
        return purge_data()
    if p['purge_images']:
        return purge_images()
    if p['format'] == 'json':
        return print_json(p)

    # Everything below is an html page. Send the header before doing any
    # work, because stderr is routed to stdout and a library warning printed
    # ahead of the header would break the response.
    send_header('text/html')

    if p['format'] == 'table':
        return print_html(table(p), new_query=e)
    if p['format'] == 'image':
        if p.get('subplots', False) and isinstance(p['tags_x'], list):
            # manually generate each graph; keep same sort order
            p['sort'] = False
            graphs = []
            tags_x = p['tags_x'].copy()
            for x in tags_x:
                p['tags_x'] = [x]
                graphs.append(graph(p))
            content = ''.join(graphs)
        else:
            content = graph(p)
        return print_html(content, new_query=e)
    return print_html(FORM.format(**q))


if __name__ == '__main__':
    run()