Source code for scripts.analyze_annotations

#!/usr/bin/env python
"""``analyze_annotations.py`` is a script that analyzes annotation results.

For an overview of command-line options, call::

  analyze_annotations.py -h


Functionality
-------------

* Count symbols
* Count symbol classes
* Compute symbol parameters per class (size, morphological features..?) [NOT IMPLEMENTED]

* Count relationships
* Count relationship classes
* Compute relationship parameters per class pair

"""
import argparse
import collections
import logging
import pprint
import time

import operator

from typing import List, Tuple, Dict, Any

from mung.io import read_nodes_from_file, get_edges
from mung.node import merge_node_lists_from_multiple_documents, Node


def compute_node_statistics(nodes: List[Node],
                            edges: List[Tuple[int, int]] = None) -> Dict[str, Any]:
    """Count nodes and, optionally, relationships, both in total and per class.

    :param nodes: The nodes to summarize. Each node's ``class_name`` is read,
        and its ``id`` as well when ``edges`` is given.
    :param edges: Optional ``(from_id, to_id)`` pairs of node ids. When it is
        ``None`` (the default), no relationship entries are produced.
    :return: An ordered mapping with the entries ``number_of_nodes``,
        ``number_of_nodes_by_class`` and ``number_of_distinct_nodes``. When
        ``edges`` is given it additionally holds ``number_of_relationships``,
        ``number_of_relationships_by_class`` (keyed by the pair of class names
        of the two endpoints) and ``number_of_relationships_distinct``.
    :raises KeyError: If an edge refers to an id that no node in ``nodes`` has.
    """
    # Histogram of class names over all nodes.
    per_class = collections.defaultdict(int)
    for item in nodes:
        per_class[item.class_name] += 1

    summary = collections.OrderedDict()
    summary['number_of_nodes'] = len(nodes)
    summary['number_of_nodes_by_class'] = per_class
    summary['number_of_distinct_nodes'] = len(per_class)

    # Without edges there is nothing more to report.
    if edges is None:
        return summary

    # Resolve each endpoint id straight to the class name of its node.
    class_name_of = {item.id: item.class_name for item in nodes}

    # Histogram of (source class, target class) pairs over all edges.
    per_class_pair = collections.defaultdict(int)
    for source_id, target_id in edges:
        pair = (class_name_of[source_id], class_name_of[target_id])
        per_class_pair[pair] += 1

    summary['number_of_relationships'] = len(edges)
    summary['number_of_relationships_by_class'] = per_class_pair
    summary['number_of_relationships_distinct'] = len(per_class_pair)
    return summary
##############################################################################
def build_argument_parser():
    """Create the command-line parser of this script.

    The module docstring serves as the description and is shown verbatim
    (raw formatting) in the help output.

    :return: An ``argparse.ArgumentParser`` that knows ``-i/--input``
        (required, one or more values), ``-e/--emit`` (only ``print``,
        which is also the default), ``-v/--verbose`` and ``--debug``.
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        add_help=True,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # One entry per option: (option strings, add_argument keyword settings).
    option_table = (
        (('-i', '--input'),
         dict(action='store', nargs='+', required=True,
              help='List of input NodeList files.')),
        (('-e', '--emit'),
         dict(action='store', default='print', choices=['print'],
              help='How should the analysis results be presented?')),
        (('-v', '--verbose'),
         dict(action='store_true',
              help='Turn on INFO messages.')),
        (('--debug',),
         dict(action='store_true',
              help='Turn on DEBUG messages.')),
    )
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)

    return cli
if __name__ == '__main__':
    # Script entry point: parse the input files, merge them, compute the
    # statistics and emit them in the requested way.
    parser = build_argument_parser()
    args = parser.parse_args()

    # logging.basicConfig() only configures the root logger on its first
    # call, so the level is chosen once; --debug takes precedence over
    # --verbose when both are given.
    if args.debug:
        logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)
    elif args.verbose:
        logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

    logging.info('Starting main...')
    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # replacement for measuring elapsed intervals.
    _start_time = time.perf_counter()

    # Parse individual Node lists.
    node_lists = []
    number_of_parsed_nodes = 0
    for i, f in enumerate(args.input):
        node_list = read_nodes_from_file(f)
        node_lists.append(node_list)

        # Logging progress (every tenth file, skipping the very first one).
        number_of_parsed_nodes += len(node_list)
        if i % 10 == 0 and i > 0:
            _time_parsing = time.perf_counter() - _start_time
            nodes_per_second = number_of_parsed_nodes / _time_parsing
            logging.info('Parsed {0} Nodes in {1:.2f} s ({2:.2f} objs/s)'
                         ''.format(number_of_parsed_nodes, _time_parsing, nodes_per_second))

    # Merge the Node lists into one.
    # This is done so that the resulting object graph can be manipulated
    # at once, without id clashes.
    merged_node_list = merge_node_lists_from_multiple_documents(node_lists)
    edges = get_edges(merged_node_list)

    _parse_end_time = time.perf_counter()
    logging.info('Parsing took {0:.2f} s'.format(_parse_end_time - _start_time))

    ##########################################################################
    # Analysis

    # Here's where the results are stored, for export into various
    # formats. (Currently, we only print them.)
    statistics = compute_node_statistics(merged_node_list, edges=edges)

    ##########################################################################
    # Export
    if args.emit == 'print':
        # No print_statistics() is defined or imported in this file, so the
        # results are pretty-printed with the already imported pprint module.
        pprint.pprint(statistics)

    _end_time = time.perf_counter()
    logging.info('analyze_annotations.py done in {0:.3f} s'
                 ''.format(_end_time - _start_time))