From 4c9349b52f7296fd7e6f3691f30d2ce29a0ed67c Mon Sep 17 00:00:00 2001
From: mromanello
Date: Sat, 9 May 2015 13:24:20 +0200
Subject: [PATCH] committing latest changes

---
 citation_extractor/__init__.py                |   2 +-
 citation_extractor/core.py                    | 231 ++++++++++++------
 citation_extractor/eval.py                    | 113 ++++-----
 citation_extractor/process.py                 | 202 +++++++++++++--
 citation_extractor/settings/__init__.pyc      | Bin 126 -> 107 bytes
 citation_extractor/settings/base_settings.pyc | Bin 1442 -> 1253 bytes
 6 files changed, 385 insertions(+), 163 deletions(-)

diff --git a/citation_extractor/__init__.py b/citation_extractor/__init__.py
index 997a77d1..cc30b042 100755
--- a/citation_extractor/__init__.py
+++ b/citation_extractor/__init__.py
@@ -1,2 +1,2 @@
 # -*- coding: utf-8 -*-
-__version__='1.3.1'
+__version__='1.3.4'
diff --git a/citation_extractor/core.py b/citation_extractor/core.py
index 69c0427b..0208e570 100755
--- a/citation_extractor/core.py
+++ b/citation_extractor/core.py
@@ -90,60 +90,158 @@ def classify(self,feature_sets):
         tagged_tokens_list = instance_to_string(feature_sets)
         return self.crf_model.classify(tagged_tokens_list)
 
-class SVM_Classifier:
+class ScikitClassifierAdapter:
     """
-    Just a wrapper around a an SklearnClassifier (nltk.classify.scikitlearn) object
-    to make sure that all classifiers take same input and return the same output.
+    An adapter for an SklearnClassifier (nltk.classify.scikitlearn) object
+    to make sure that all classifiers take the same input, return the same output
+    and are trained in the same way.
+
+    scikit_classifier:
+        a scikit-learn classifier *instance*
+
+    train_file_name:
+        the path to the training file (IOB format)
+
+    template_file_name:
+        the CRF++ template file used to extract additional features for optimization purposes
+
    """
-    def __init__(self, train_file_name):
-        from sklearn.svm import LinearSVC
+    def __init__(self, scikit_classifier, train_file_name,template_file_name,labelled_feature_sets=None):
         from nltk.classify.scikitlearn import SklearnClassifier
+        from sklearn.ensemble import AdaBoostClassifier
+        from sklearn.naive_bayes import GaussianNB
+        from sklearn.ensemble import RandomForestClassifier
         fe = FeatureExtractor()
-        self.classifier = SklearnClassifier(LinearSVC(),sparse=False)
+        #self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
+        if(isinstance(scikit_classifier,RandomForestClassifier)):
+            self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
+        elif(isinstance(scikit_classifier,GaussianNB)):
+            self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
+        else:
+            self.classifier = SklearnClassifier(scikit_classifier)
+        self.compiled_templates = self.process_template(template_file_name)
         feature_sets = []
-        iob_data = file_to_instances(train_file_name)
-        print "instances ",len(iob_data)
-        print "tokens",count_tokens(iob_data)
-        for n,instance in enumerate(iob_data[:10]):
-            sentence_n = n
-            pos_tags = [('z_POS',token[1]) for token in instance]
-            labels = [token[2] for token in instance]
-            tokens = [token[0] for token in instance]
-            for n,token in enumerate(tokens):
-                dict_features = fe.get_features([token],labels=labels,outp_label=False,legacy_features=pos_tags)[0]
-                # this has to be removed when training the CRF model (!)
-                dict_features["q:id"]=sentence_n
-                feature_sets.append([dict_features, labels[n]])
-        #pprint(feature_sets)
-        print len(feature_sets)
-        self.classifier.train(feature_sets)
+        if(labelled_feature_sets is not None):
+            feature_sets = labelled_feature_sets
+            logger.info("using a pre-computed feature_sets containing %i instances"%len(feature_sets))
+        else:
+            iob_data = file_to_instances(train_file_name)
+            logger.info("instances %i"%len(iob_data))
+            logger.info("tokens %i"%count_tokens(iob_data))
+            for n,instance in enumerate(iob_data):
+                sentence_n = n
+                pos_tags = [('z_POS',token[1]) for token in instance]
+                labels = [token[2] for token in instance]
+                tokens = [token[0] for token in instance]
+                for n,token in enumerate(tokens):
+                    dict_features = fe.get_features([token],labels=labels,outp_label=False,legacy_features=pos_tags)[0]
+                    feature_sets.append([dict_features, labels[n]])
+        self.classifier.train(self.apply_feature_template(feature_sets,out_label=True))
         return
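NLTK's SklearnClassifier feeds the wrapped estimator a sparse feature matrix unless told otherwise, which is why the two estimators special-cased above (RandomForestClassifier and, in particular, GaussianNB, which requires dense arrays) are constructed with sparse=False. A minimal sketch of the wrapping this adapter relies on, with made-up features and labels:

    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from nltk.classify.scikitlearn import SklearnClassifier
    >>> wrapped = SklearnClassifier(RandomForestClassifier(), sparse=False)
    >>> _ = wrapped.train([({'a_token': u'Hom.'}, 'B-AAUTHOR'), ({'a_token': u'reads'}, 'O')])
    >>> wrapped.classify_many([{'a_token': u'Hom.'}])
    ['B-AAUTHOR']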
     def classify(self,feature_sets):
-        return self.classifier.classify_many(feature_sets)
+        """
+        Args:
+            feature_sets:
+                a list of dictionaries like the following:
 
-class NaiveBayes_Classifier:
-    """
-    Just a wrapper around a an SklearnClassifier (nltk.classify.scikitlearn) object
-    to make sure that all classifiers take same input and return the same output.
-    """
-    def __init__():
-        pass
+        [{'a_token': u'Nella',
+          'b_punct': 'OTHERS',
+          'c_brackets': 'OTHERS',
+          'd_case': 'INIT_CAPS',
+          'e_number': 'NO_DIGITS',
+          'f_ngram_1': u'N',
+          'f_ngram_2': u'Ne',
+          'f_ngram_3': u'Nel',
+          'f_ngram_4': u'Nell',
+          'g_ngram_1': u'a',
+          'g_ngram_2': u'la',
+          'g_ngram_3': u'lla',
+          'g_ngram_4': u'ella',
+          'h_lowcase': u'nella',
+          'i_str-length': '5',
+          'l_pattern': 'Aaaaa',
+          'm_compressed-pattern': 'Aa',
+          'n_works_dictionary': 'OTHERS',
+          'z': '_'}, ... ]
 
-    def classify():
-        pass
+        Returns:
+            result:
+                a list of dictionaries, one per input token, each containing
+                the token and the label assigned to it:
 
-class RandomForest_Classifier:
-    """
-    Just a wrapper around a an SklearnClassifier (nltk.classify.scikitlearn) object
-    to make sure that all classifiers take same input and return the same output.
-    """
-    def __init__():
-        pass
+        [{'token': '.',
+          'label': 'O'},...]
+        """
+        # apply the feature templates (from CRF++) to the input
+        template_feature_sets = self.apply_feature_template(feature_sets,out_label=False)
+        # collect the labels predicted by the wrapped classifier
+        output_labels = self.classifier.classify_many(template_feature_sets)
+        result = []
+        for n,feature_set in enumerate(feature_sets):
+            temp = {}
+            temp["token"]=feature_set["a_token"].encode('utf-8')
+            temp["label"]=str(output_labels[n])
+            result.append(temp)
+        return result
+    def process_template(self,template_file):
+        """
+        Read a CRF++ template file and compile each feature template
+        into a (pattern, offsets) pair.
 
-    def classify():
-        pass
+        Example of the output:
+
+        [('U01:%s', [(-2, 0)]),
+         ('U02:%s', [(-1, 0)]),...]
+        """
+        f = open(template_file,'r')
+        lines = [line.replace('\n','') for line in f.readlines() if not line.startswith('\n') and not line.startswith('#') and not line.startswith('B')]
+        f.close()
+        import re
+        exp = re.compile("%x\[(-?\d+),(-?\d+)\]")
+        result = []
+        for line in lines:
+            result.append((exp.sub('%s',line),[(int(match[0]),int(match[1])) for match in exp.findall(line)]))
+        return result
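The transformation performed on each CRF++ template line boils down to one regular expression; a sketch of the same logic applied to a single line such as U01:%x[-2,0]:

    >>> import re
    >>> exp = re.compile("%x\[(-?\d+),(-?\d+)\]")
    >>> line = "U01:%x[-2,0]"
    >>> (exp.sub('%s', line), [(int(r), int(c)) for r, c in exp.findall(line)])
    ('U01:%s', [(-2, 0)])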
+    def apply_feature_template(self,feature_sets,out_label=False):
+        """
+        Apply each of the compiled templates (see `process_template`)
+        to the given feature sets.
+        """
+        def get_value(feature_sets,token_n,feature_n):
+            if(token_n < 0):
+                return "ND"
+            elif(token_n > (len(feature_sets)-1)):
+                return "ND"
+            else:
+                return feature_sets[token_n][feature_n]
+        if(out_label):
+            unlabelled_feature_sets = [[f[0][key] for key in sorted(f[0])] for f in feature_sets]
+        else:
+            unlabelled_feature_sets = [[f[key] for key in sorted(f)] for f in feature_sets]
+        assert len(feature_sets) == len(unlabelled_feature_sets)
+        new_features = []
+        for n,fs in enumerate(unlabelled_feature_sets):
+            result = {}
+            for template,replacements in self.compiled_templates:
+                template_name = template.split(":")[0]
+                template = template.split(":")[1]
+                values = [get_value(unlabelled_feature_sets,n+r[0],r[1]) for r in replacements]
+                result[template_name] = template%tuple(values)
+            if(out_label):
+                # keep the expected label for training
+                new_features.append([result,feature_sets[n][1]])
+            else:
+                new_features.append(result)
+        return new_features
+
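To make the windowing concrete: given an already-constructed adapter instance (hypothetical here) whose compiled templates look at the previous and the current token, a toy run behaves as follows. Positions before the first or after the last token yield the "ND" padding value (dict key order may differ):

    >>> adapter.compiled_templates = [('U01:%s', [(-1, 0)]), ('U02:%s', [(0, 0)])]
    >>> tokens = [{'a_token': u'Hom.'}, {'a_token': u'Il.'}, {'a_token': u'1.1'}]
    >>> adapter.apply_feature_template(tokens)
    [{'U01': 'ND', 'U02': u'Hom.'}, {'U01': u'Hom.', 'U02': u'Il.'}, {'U01': u'Il.', 'U02': u'1.1'}]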
+def chain_IOB_files(directories,output_fname,extension=".iob"):
     import glob
     import codecs
     all_in_one = []
@@ -152,7 +250,7 @@ def chain_IOB_files(directories,output_fname):
         # concatenate their content with line return
         # write to a new file
         logger.debug("Processing %s"%dir)
-        for infile in glob.glob( os.path.join(dir, '*.iob') ):
+        for infile in glob.glob( os.path.join(dir, '*%s'%extension) ):
             logger.debug("Found the file %s"%infile)
             file_content = codecs.open("%s"%(infile), 'r',encoding="utf-8").read()
             all_in_one.append(file_content)
@@ -184,53 +282,28 @@ class citation_extractor:
     And finally we classify the test instances
 
     >>> result = extractor.extract(instances, postags)
+
+    By default, a CRF model is used. However, when initialising the `citation_extractor` you can
+    pass on to it any scikit-learn classifier, e.g. a RandomForestClassifier:
+
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> extractor = citation_extractor(base_settings,RandomForestClassifier())
+
     """
-    def __init__(self,options,classifier_type):
+    def __init__(self,options,classifier=None,labelled_feature_sets=None):
         self.classifier=None
-        logfile = ""
-        if(options.DEBUG):
-            self.init_logger(loglevel=logging.DEBUG, log_file=options.LOG_FILE)
-        else:
-            self.init_logger(loglevel=logging.INFO, log_file=options.LOG_FILE)
         self.fe = FeatureExtractor()
         if(options.DATA_FILE != ""):
             allinone_iob_file = options.DATA_FILE
         elif(options.DATA_DIRS != ""):
-            chain_IOB_files(options.DATA_DIRS,"%sall_in_one.iob"%options.TEMP_DIR)
+            chain_IOB_files(options.DATA_DIRS,"%sall_in_one.iob"%options.TEMP_DIR,".txt")
             allinone_iob_file = "%sall_in_one.iob"%options.TEMP_DIR
         # initialise the classifier
-        if(classifier_type == "crf"):
+        if(classifier is None):
             self.classifier=CRFPP_Classifier(allinone_iob_file,"%s%s"%(options.CRFPP_TEMPLATE_DIR,options.CRFPP_TEMPLATE),options.TEMP_DIR)
-        elif(classifier_type == "svm"):
-            self.classifier = SVM_Classifier(allinone_iob_file)
-        elif(classifier_type == "rf"):
-            self.classifier = RandomForest_Classifier(allinone_iob_file)
-        elif(classifier_type == "nb"):
-            self.classifier = NaiveBayes_Classifier(allinone_iob_file)
-        else:
-            pass
-
-    def init_logger(self,log_file=None, loglevel=logging.DEBUG):
-        """
-        Initialise the logger
-        """
-        if(log_file !="" or log_file is not None):
-            logging.basicConfig(
-                filename=log_file
-                ,level=loglevel,format='%(asctime)s - %(name)s - [%(levelname)s] %(message)s',filemode='w',datefmt='%a, %d %b %Y %H:%M:%S'
-            )
-            logger = logging.getLogger('CREX')
-            logger.info("Logger initialised")
         else:
-            logger = logging.getLogger('CREX')
-            logger.setLevel(loglevel)
-            ch = logging.StreamHandler()
-            ch.setLevel(loglevel)
-            formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-            ch.setFormatter(formatter)
-            logger.addHandler(ch)
-            logger.info("Logger initialised")
+            self.classifier = ScikitClassifierAdapter(classifier,allinone_iob_file,"%s%s"%(options.CRFPP_TEMPLATE_DIR,options.CRFPP_TEMPLATE),labelled_feature_sets)
 
     def output(self,result,outp=None):
         """
diff --git a/citation_extractor/eval.py b/citation_extractor/eval.py
index 6b115bd7..74868b3a 100755
--- a/citation_extractor/eval.py
+++ b/citation_extractor/eval.py
@@ -88,7 +88,10 @@ def __init__(self,extractors,iob_directories=[],iob_file=None,label_index=-1):
         self.logger = logging.getLogger("CREX.SIMPLEVAL")
         if(iob_file is None):
             self.logger.debug(iob_directories)
-            self.test_instances = self.read_instances(iob_directories)
+            data = []
+            for directory in iob_directories:
+                data += IO.read_iob_files(directory,".txt")
+            self.test_instances = data
         else:
             self.test_instances = IO.file_to_instances(iob_file)
         self.logger.debug("Found %i instances for test"%len(self.test_instances))
@@ -106,13 +109,15 @@ def eval(self):
         TODO
         """
         extractor_results = {}
-        for eng in self.extractors:
-            input = [[[token[0] for token in instance] for instance in self.test_instances]]
+        for extractor in self.extractors:
+            eng = extractor[1]
+            extractor_name = extractor[0]
+            input = [[token[0] for token in instance] for instance in self.test_instances if len(instance)>0]
             POS = False
             if(len(self.test_instances[0][0]) > 2):
                 self.label_index = 2 # the last one is the label
-                legacy_features = [[("z_POS",token[1]) for token in instance] for instance in self.test_instances]
-                output = eng.extract(input,[legacy_features])
+                legacy_features = [[("z_POS",token[1]) for token in instance] for instance in self.test_instances if len(instance)>0]
+                output = eng.extract(input,legacy_features)
                 POS = True
             else:
                 output = eng.extract(input)
@@ -128,7 +133,7 @@ def eval(self):
             eval_results["precision"] = self.calc_precision(eval_results)
             eval_results["recall"] = self.calc_recall(eval_results)
             by_tag_results = self.calc_stats_by_tag(by_tag_results)
-            extractor_results[str(eng)] = results
+            extractor_results[extractor_name] = results
         return extractor_results
 
     @staticmethod
@@ -250,27 +255,27 @@ def evaluate(l_tagged_instances,l_test_instances,negative_BIO_tag = u'O',label_i
                 if(tagged_label == gold_label):
                     p_tp += 1
                     errors_by_tag[gold_label]["true_pos"] += 1
-                    l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "TP"))
+                    l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("TP",tagged_token, tagged_label, gold_label))
                 elif(tagged_label != gold_label):
                     if(tagged_label == negative_BIO_tag):
                         p_fn += 1
                         errors_by_tag[gold_label]["false_neg"] += 1
-                        l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "FN"))
+                        l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("FN",tagged_token, tagged_label, gold_label))
                     else:
                         p_fp += 1
                         errors_by_tag[gold_label]["false_pos"] += p_fp
-                        l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "FP"))
+                        l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("FP",tagged_token, tagged_label, gold_label))
             elif(gold_label == negative_BIO_tag):
                 l_logger.debug("Label \"%s\" for token \"%s\" is negative"%(gold_label,gold_token))
                 if(tagged_label == gold_label):
                     p_tn += 1
                     errors_by_tag[gold_label]["true_pos"] += 1
-                    l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "TN"))
+                    l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("TN",tagged_token, tagged_label, gold_label))
                 elif(tagged_label != gold_label):
                     if(tagged_label != negative_BIO_tag):
                         p_fp += 1
                         errors_by_tag[gold_label]["false_pos"] += 1
-                        l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "FP"))
+                        l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("FP",tagged_token, tagged_label, gold_label))
         fp += p_fp
         tp += p_tp
         fn += p_fn
@@ -408,8 +413,6 @@ def calc_fscore(d_errors):
             return 0
         else:
             return 2*(float(prec * rec) / float(prec + rec))
-
-
 class CrossEvaluator(SimpleEvaluator):
     """
     >>> import settings #doctest: +SKIP
@@ -442,7 +445,7 @@ def __init__(self,extractors,iob_test_file,culling_size=None,fold_number=10,eval
 
     def create_datasets(self):
         """
-        docstring for create_datasets
+        TODO
         """
         from miguno.partitioner import *
@@ -465,22 +468,11 @@ def create_datasets(self):
 
     def run(self):
         """
-        docstring for run
-
-        TODO:
-            for each iteration
-                for each engine (extractor)
-                    write to file the train set
-                    write to file the test set
-                    evaluate
-                    append to
-                    results[extractors[str(extractor_1)]][round-n][fscore]
-                    results[extractors[str(extractor_1)]][round-n][prec]
-                    results[extractors[str(extractor_1)]][round-n][...]
-
+        TODO
         """
         iterations = []
         results = {}
+        results_by_entity = {}
         # first let's create test and train set for each iteration
         for x,iter in enumerate(self.dataSets_iterator):
             self.logger.info("Iteration %i"%(x+1))
@@ -497,38 +489,47 @@ def run(self):
         # let's go through all the iterations
         for i,iter in enumerate(iterations):
             results["iter-%i"%(i+1)] = {}
-            for n,extractor_settings in enumerate(self.extractors):
-                results["iter-%i"%(i+1)]["extractor-%i"%(n+1)] = {}
-                self.logger.info("Running iteration #%i with extractor #%i"%(i+1,n+1))
-                self.logger.info(extractor_settings)
-                train_file="%sfold_%i.train"%(self.evaluation_dir,i+1)
-                test_file="%sfold_%i.test"%(self.evaluation_dir,i+1)
+            results_by_entity["iter-%i"%(i+1)] = {}
+            train_file="%sfold_%i.train"%(self.evaluation_dir,i+1)
+            test_file="%sfold_%i.test"%(self.evaluation_dir,i+1)
+            IO.write_iob_file(iter[0],train_file)
+            IO.write_iob_file(iter[1],test_file)
+            # the following lines are a bit of a workaround:
+            # to avoid recomputing the features when training
+            # each new classifier, I take them from the file created
+            # to train the CRF model (which should always be the first extractor
+            # to be evaluated).
+            filename = "%sfold_%i.train.train"%(self.extractors[0][1].TEMP_DIR,(i+1))
+            f=codecs.open(filename,'r','utf-8')
+            data = f.read()
+            f.close()
+            feature_sets=[[[token.split('\t')[:len(token.split('\t'))-1],token.split('\t')[len(token.split('\t'))-1:]] for token in instance.split('\n')] for instance in data.split('\n\n')]
+            order = FeatureExtractor().get_feature_order()
+            labelled_feature_sets=[]
+            for instance in feature_sets:
+                for token in instance:
+                    temp = [{order[n]:feature for n,feature in enumerate(token[0])},token[1][0]]
+                    labelled_feature_sets.append(temp)
+            self.logger.info("read %i labelled instances"%len(feature_sets))
+            for n,extractor in enumerate(self.extractors):
+                extractor_settings = extractor[1]
+                extractor_name = extractor[0]
+                results["iter-%i"%(i+1)][extractor_name] = {}
+                self.logger.info("Running iteration #%i with extractor %s"%(i+1,extractor_name))
                 self.logger.info(train_file)
                 self.logger.info(test_file)
-                import codecs
-                file = codecs.open(train_file,'w','utf-8')
-                if(len(iter[0][0][0])==2):
-                    tmp = [[("%s\t%s"%(token[0],token[1]))for token in instance] for instance in iter[0]]
-                else:
-                    tmp = [[("%s\t%s\t%s"%(token[0],token[1],token[2]))for token in instance] for instance in iter[0]]
-                tmp = ["\n".join(x) for x in tmp]
-                to_write = "\n\n".join(tmp)
-                file.write(to_write)
-                file.close()
-                file = codecs.open(test_file,'w','utf-8')
-                if(len(iter[0][0][0])==2):
-                    tmp = [[("%s\t%s"%(token[0],token[1]))for token in instance] for instance in iter[1]]
-                else:
-                    tmp = [[("%s\t%s\t%s"%(token[0],token[1],token[2]))for token in instance] for instance in iter[1]]
-                tmp = ["\n".join(x) for x in tmp]
-                to_write = "\n\n".join(tmp)
-                file.write(to_write)
-                file.close()
+                self.logger.info(extractor_settings)
                 extractor_settings.DATA_FILE = train_file
-                extractor = citation_extractor(extractor_settings)
-                se = SimpleEvaluator([extractor,],iob_file=test_file)
-                results["iter-%i"%(i+1)]["extractor-%i"%(n+1)] = se.eval()[str(extractor)]
-        return results
+                if(extractor_settings.CLASSIFIER is not None):
+                    extractor = citation_extractor(extractor_settings, extractor_settings.CLASSIFIER,labelled_feature_sets)
+                else:
+                    extractor = citation_extractor(extractor_settings)
+                self.logger.info(extractor.classifier)
+                se = SimpleEvaluator([(extractor_name, extractor),],iob_file=test_file)
+                results["iter-%i"%(i+1)][extractor_name] = se.eval()[extractor_name][0]
+                results_by_entity["iter-%i"%(i+1)][extractor_name] = SimpleEvaluator.calc_stats_by_entity(se.eval()[extractor_name][1])
+                #self.logger.info(results_by_entity["iter-%i"%(i+1)][extractor_name])
+        return results,results_by_entity
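For orientation, run() now returns two dictionaries keyed first by iteration and then by extractor name: the first holds the aggregate figures computed by SimpleEvaluator (including the "precision" and "recall" entries set in eval() above), the second the per-entity breakdown from calc_stats_by_entity. A hypothetical inspection, with evaluator standing for a constructed CrossEvaluator:

    >>> results, results_by_entity = evaluator.run()
    >>> for iteration in sorted(results):
    ...     for name in results[iteration]:
    ...         print iteration, name, results[iteration][name]["precision"], results[iteration][name]["recall"]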
 
 if __name__ == "__main__":
     #Usage example: python eval.py aph_data_100_positive/ out/
diff --git a/citation_extractor/process.py b/citation_extractor/process.py
index 796b2942..368b1f1c 100644
--- a/citation_extractor/process.py
+++ b/citation_extractor/process.py
@@ -1,4 +1,58 @@
+# -*- coding: utf-8 -*-
+# author: Matteo Romanello, matteo.romanello@gmail.com
+
 import sys
+import logging
+
+global logger
+logger = logging.getLogger()
+
+def recover_segmentation_errors(text,abbreviation_list,verbose=False):
+    """
+    Pretty straightforward heuristic here:
+    if a line of text contains a single token which matches against a list of abbreviations,
+    assume that after this token there shouldn't be a sentence break; the same applies to
+    the last token of a line consisting of more than one token.
+
+    >> import codecs
+    >> abbrev_file = "data/abbreviations_all_in_one.txt"
+    >> abbrev = codecs.open(abbrev_file).read().split('\n')
+    >> text_file = 'data/txt/ocr_10.2307_40231021.txt'
+    >> text = codecs.open(text_file,'r','utf-8').read()
+    >> recover_segmentation_errors(text,abbrev,verbose=True)
+    """
+    def is_abbreviation(token,abbreviations):
+        return token in abbreviations
+    output = []
+    text_lines = text.split('\n')
+    if(verbose):
+        print >> sys.stderr, "Input text has %i lines"%len(text_lines)
+    for line in text_lines:
+        tokens=line.split()
+        if(len(tokens)==1):
+            output+=tokens
+            if(not is_abbreviation(tokens[0],abbreviation_list)):
+                output.append('\n')
+            else:
+                if(verbose):
+                    print >> sys.stderr,"%s is an abbreviation"%tokens[0]
+        else:
+            output+=tokens
+            try:
+                last_token = tokens[len(tokens)-1]
+                if(not is_abbreviation(last_token,abbreviation_list)):
+                    output.append('\n')
+                else:
+                    if(verbose):
+                        print >> sys.stderr,"%s is an abbreviation"%last_token
+            except Exception, e:
+                pass
+    output_text = " ".join(output)
+    if(verbose):
+        print >> sys.stderr, "Output text has %i lines"%len(output_text.split('\n'))
+        print >> sys.stderr, "%i line breaks were recovered"%(len(text_lines)-len(output_text.split('\n')))
+    return output_text
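A toy illustration of the intended behaviour, with a two-entry abbreviation list: the u'Hom.' ending the first line suppresses the sentence break, so the two lines are merged (output is indicative, the function re-joins all tokens with single spaces):

    >>> abbreviations = [u'Hom.', u'Il.']
    >>> merged = recover_segmentation_errors(u'as noted by Hom.\nIl. 1,1 already', abbreviations)
    >>> print merged
    as noted by Hom. Il. 1,1 already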
 
 def get_taggers(treetagger_dir = '/Applications/treetagger/cmd/',abbrev_file=None):
     """docstring for create_taggers"""
@@ -6,7 +60,7 @@ def get_taggers(treetagger_dir = '/Applications/treetagger/cmd/',abbrev_file=Non
     import os
     os.environ["TREETAGGER"]=treetagger_dir
     lang_codes = {
-        'en':('english','latin-1'),
+        'en':('english','utf8'),
         'it':('italian','utf8'),
         'es':('spanish','utf8'),
         'de':('german','utf8'),
@@ -20,6 +74,27 @@
             raise e
     return taggers
 
+def get_extractor(settings):
+    """
+    Instantiate, train and return a citation_extractor.
+    """
+    import sys
+    import citation_extractor as citation_extractor_module
+    from citation_extractor.core import citation_extractor
+    from citation_extractor.eval import IO
+    ce = None
+    try:
+        logger.info("Using CitationExtractor v. %s"%citation_extractor_module.__version__)
+        train_instances = []
+        for directory in settings.DATA_DIRS:
+            train_instances += IO.read_iob_files(directory,extension=".txt")
+        logger.info("Training data: found %i directories containing %i sentences and %i tokens"%(len(settings.DATA_DIRS),len(train_instances),IO.count_tokens(train_instances)))
+        ce = citation_extractor(settings)
+    except Exception, e:
+        print e
+    finally:
+        return ce
+
 def detect_language(text):
     """
     Detect language of a notice by using the module guess_language.
@@ -51,6 +126,36 @@ def create_instance_tokenizer(train_dirs=[("/Users/56k/phd/code/APh/corpus/txt/"
         train_text += [codecs.open(file,'r','utf-8').read() for file in glob.glob( os.path.join(dir[0], '*%s'%dir[1]))]
     return PunktSentenceTokenizer(sep.join(train_text))
 
+def compact_abbreviations(abbreviation_dir):
+    """
+    process several files with abbreviations,
+    chain them together and write them to a file
+    """
+    fname = "%s%s"%(abbreviation_dir,"kb_abbrevs.txt")
+    import codecs
+    f = codecs.open(fname,'w','utf-8')
+    abbrvs = get_abbreviations_from_knowledge_base()
+    f.write("\n".join(abbrvs))
+    f.close()
+    abbreviations = []
+    files = [
+        fname
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/ita/abbreviations.txt"
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/eng/abbreviations.txt"
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/ita/no_split_abbreviations.txt"
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/eng/no_split_abbreviations.txt"
+    ]
+    for fn in files:
+        f = codecs.open(fn,'r','utf-8')
+        print >> sys.stderr, "getting abbreviations from %s"%fn
+        abbreviations = abbreviations + [line for line in f.readlines() if not line.startswith("#") and line !=""]
+    abbreviations = sorted(list(set(abbreviations)))
+    fname = "%s%s"%(abbreviation_dir,"abbreviations_all_in_one.txt")
+    f = codecs.open(fname,'w','utf-8')
+    f.write("".join(abbreviations))
+    f.close()
+    return fname,abbreviations
+
 def split_sentences(filename,outfilename=None):
     """
     sentence tokenization
@@ -392,8 +497,10 @@ def tokenize(sentences,taggers, outfilename=None):
         tok_lang = lang
         if(tok_lang in ["en*","en**"]):
             tok_lang = "en"
-        tmp = [result[:2] for result in taggers[tok_lang].tag(sent)]
-        #print >> sys.stderr,"Tokenized sentence %i / %i"%(n,len(sentences))
+        try:
+            tmp = [result[:2] for result in taggers[tok_lang].tag(sent)]
+        except Exception, e:
+            print >> sys.stderr,e
+            tmp = [] # avoid appending the previous sentence's tokens again
         iob.append(tmp)
     return lang,iob
@@ -415,7 +522,7 @@ def preprocess(filename,taggers, outputdir, outfilename=None,split_sentence=Fals
     if(split_sentence):
         sentences = split_sentences(filename)
     else:
-        sentences = [text.replace("\n"," ")]
+        sentences = text.split('\n')
     print >> sys.stderr, "Text was split into %i sentences"%len(sentences)
     # tokenize
     lang, iob = tokenize(sentences,taggers)
@@ -440,7 +547,6 @@ def save_scope_annotations(fileid, ann_dir, annotations):
     t[1] is the label (it doesn't get written to the file)
     t[2] is the URN, i.e. the content of the annotation
     if t[2] is None the annotation is skipped
-
     """
     ann_file = "%s%s-doc-1.ann"%(ann_dir,fileid)
     file_content = open(ann_file,'r').read()
@@ -468,60 +574,102 @@ def tostandoff(iobfile,standoffdir,brat_script):
     except Exception, e:
         raise e
 
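Before matching, the two disambiguation helpers below normalise their input strings with small regular expressions that strip guillemets, parentheses and commas. The effect of the citation-string cleaning pattern in isolation, on an illustrative input:

    >>> import re
    >>> regex_clean_citstring = r'(« )|( »)|\(|\)|\,'
    >>> re.sub(regex_clean_citstring, "", u'(Hom. Il.),')
    u'Hom. Il.'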
-def disambiguate_relations(citation_matcher, relations,entities,docid):
+def disambiguate_relations(citation_matcher,relations,entities,docid,fuzzy=False,distance_threshold=3,fill_nomatch_with_bogus_urn=False):
     """
-
-    TODO
-
     Returns:
-        (u'R5', u'[ Verg. ] catal. 47s', u'urn:cts:TODO:47s')
+        [(u'R5', u'[ Verg. ] catal. 47s', u'urn:cts:TODO:47s')]
     """
     import re
-    print >> sys.stderr, "Disambiguating the %i relation contained in %s..."%(len(relations), docid)
     result = []
     for relation in relations:
         relation_type = relations[relation][0]
         arg1 = relations[relation][1].split(":")[1]
         arg2 = relations[relation][2].split(":")[1]
-        refauwo=entities[arg1][1]
-        refauwo=re.sub("[\(, \)]","",refauwo) # TODO move this to CitationParser
+        citation_string=entities[arg1][1]
         scope = entities[arg2][1]
-        scope = re.sub("\.$","",scope)
-        scope = re.sub("\,$","",scope)
-        scope = re.sub("[\(, \)]","",scope)
+        regex_clean_citstring = r'(« )|( »)|\(|\)|\,'
+        regex_clean_scope = r'(\(|\)| ?\;$|\.$|\,$)'
+        citation_string_cleaned = re.sub(regex_clean_citstring,"",citation_string)
+        scope_cleaned = re.sub(regex_clean_scope,"",scope)
+        print >> sys.stderr, "Citation_string cleaning: from \'%s\' to \'%s\'"%(citation_string,citation_string_cleaned)
+        print >> sys.stderr, "Scope cleaning: from \'%s\' to \'%s\'"%(scope,scope_cleaned)
+        citation_string = citation_string_cleaned
+        scope = scope_cleaned
         try:
-            urn = citation_matcher.disambiguate(refauwo,scope)[0]
-            result.append((relation,"%s %s"%(refauwo,scope),urn))
+            urn = citation_matcher.disambiguate(citation_string,scope,fuzzy=fuzzy,distance_threshold=distance_threshold,cleanup=True)[0]
+            result.append((relation,"%s %s"%(citation_string,scope),urn))
         except Exception, e:
             normalized_scope = scope
             try:
                 normalized_scope = citation_matcher._citation_parser.parse(scope)
                 normalized_scope = citation_matcher._format_scope(normalized_scope[0]['scp'])
             except Exception, e:
-                print >> sys.stderr, e
-            result.append((relation,"%s %s"%(refauwo,scope),None))
+                print e
+            if(fill_nomatch_with_bogus_urn):
+                result.append((relation,"%s %s"%(citation_string,scope),"urn:cts:TODO:%s"%normalized_scope))
     return result
-
 def disambiguate_entities(citation_matcher,entities,docid,min_distance_threshold,max_distance_threshold):
+    """
+    When no match is found it's better not to fill in a bogus URN. The
+    reason is that in some cases it's perfectly ok that no match is found: an entity
+    can be a valid entity even without disambiguation information in the ground truth.
+    """
+    def longestSubstringFinder(string1, string2):
+        """
+        solution taken from http://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings
+        """
+        answer = ""
+        len1, len2 = len(string1), len(string2)
+        for i in range(len1):
+            match = ""
+            for j in range(len2):
+                if (i + j < len1 and string1[i + j] == string2[j]):
+                    match += string2[j]
+                else:
+                    if (len(match) > len(answer)): answer = match
+                    match = ""
+        return answer
+    import re
+    from operator import itemgetter
     print >> sys.stderr, "Disambiguating the %i entities contained in %s..."%(len(entities), docid)
     result = []
+    matches = []
     distance_threshold = min_distance_threshold
+    regex_clean_string = r'(« )|( »)|\(|\)|\,'
     for entity in entities:
         entity_type = entities[entity][0]
+        string = entities[entity][1].encode("utf-8")
+        cleaned_string = re.sub(regex_clean_string,"",string)
+        print >> sys.stderr, "String cleaning: from \'%s\' to \'%s\'"%(string,cleaned_string)
+        string = cleaned_string
         if entity_type == "AAUTHOR":
-            string = entities[entity][1]
             matches = citation_matcher.matches_author(string,True,distance_threshold)
             while(matches is None and distance_threshold <= max_distance_threshold):
                 distance_threshold+=1
                 matches = citation_matcher.matches_author(string,True,distance_threshold)
-            if(matches is not None):
-                result.append((entity, string ,matches[0][0]))
         elif(entity_type == "AWORK"):
-            string = entities[entity][1]
             matches = citation_matcher.matches_work(string,True,distance_threshold)
             while(matches is None and distance_threshold <= max_distance_threshold):
                 distance_threshold+=1
                 matches = citation_matcher.matches_work(string,True,distance_threshold)
-            if(matches is not None):
-                result.append((entity, string ,matches[0][0]))
+        if(matches is not None and (entity_type == "AAUTHOR" or entity_type == "AWORK")):
+            lowest_score = 1000
+            for match in matches:
+                score = match[2]
+                if(score < lowest_score):
+                    lowest_score = score
+            filtered_matches = [match for match in matches if match[2]==lowest_score]
+            filtered_matches = sorted(filtered_matches, key=itemgetter(2))
+            best_match = ("",None)
+            if(lowest_score > 0):
+                for match in filtered_matches:
+                    lcs = longestSubstringFinder(match[1],string)
+                    if(len(lcs)>len(best_match[0])):
+                        best_match = (lcs,match)
+            if(best_match[1] is not None):
+                result.append((entity,string,best_match[1][0]))
+            else:
+                result.append((entity,string,filtered_matches[0][0]))
     return result
\ No newline at end of file
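The nested longestSubstringFinder above is what breaks ties between equally scored fuzzy matches: the candidate sharing the longest common substring with the entity string wins. Lifted to module scope for the sake of a quick check:

    >>> longestSubstringFinder("Homerus", "Homeros")
    'Homer'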
diff --git a/citation_extractor/settings/__init__.pyc b/citation_extractor/settings/__init__.pyc
index 84447dc35907440a5f5368f6f917a0d816fde85e..e7e81df166c435b6e71bc3edc854aadd684b999a 100644
GIT binary patch