From 4c9349b52f7296fd7e6f3691f30d2ce29a0ed67c Mon Sep 17 00:00:00 2001
From: mromanello
Date: Sat, 9 May 2015 13:24:20 +0200
Subject: [PATCH] committing latest changes

---
 citation_extractor/__init__.py                |   2 +-
 citation_extractor/core.py                    | 231 ++++++++++++------
 citation_extractor/eval.py                    | 113 ++++-----
 citation_extractor/process.py                 | 202 +++++++++++++--
 citation_extractor/settings/__init__.pyc      | Bin 126 -> 107 bytes
 citation_extractor/settings/base_settings.pyc | Bin 1442 -> 1253 bytes
 6 files changed, 385 insertions(+), 163 deletions(-)

diff --git a/citation_extractor/__init__.py b/citation_extractor/__init__.py
index 997a77d1..cc30b042 100755
--- a/citation_extractor/__init__.py
+++ b/citation_extractor/__init__.py
@@ -1,2 +1,2 @@
 # -*- coding: utf-8 -*-
-__version__='1.3.1'
+__version__='1.3.4'
diff --git a/citation_extractor/core.py b/citation_extractor/core.py
index 69c0427b..0208e570 100755
--- a/citation_extractor/core.py
+++ b/citation_extractor/core.py
@@ -90,60 +90,158 @@ def classify(self,feature_sets):
         tagged_tokens_list = instance_to_string(feature_sets)
         return self.crf_model.classify(tagged_tokens_list)
 
-class SVM_Classifier:
+class ScikitClassifierAdapter:
     """
-    Just a wrapper around a an SklearnClassifier (nltk.classify.scikitlearn) object
-    to make sure that all classifiers take same input and return the same output.
+    An adapter for an SklearnClassifier (nltk.classify.scikitlearn) object
+    to make sure that all classifiers take the same input, return the same output
+    and are trained in the same way.
+
+    scikit_classifier:
+        a scikit-learn classifier *instance*
+
+    train_file_name:
+        the path to the training file (IOB format)
+
+    template_file_name:
+        the CRF++ template file used to extract additional features for optimization purposes
+
    """
-    def __init__(self, train_file_name):
-        from sklearn.svm import LinearSVC
+    def __init__(self, scikit_classifier, train_file_name,template_file_name,labelled_feature_sets=None):
         from nltk.classify.scikitlearn import SklearnClassifier
+        from sklearn.ensemble import AdaBoostClassifier
+        from sklearn.naive_bayes import GaussianNB
+        from sklearn.ensemble import RandomForestClassifier
         fe = FeatureExtractor()
-        self.classifier = SklearnClassifier(LinearSVC(),sparse=False)
+        #self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
+        if(isinstance(scikit_classifier,RandomForestClassifier)):
+            self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
+        elif(isinstance(scikit_classifier,GaussianNB)):
+            self.classifier = SklearnClassifier(scikit_classifier,sparse=False)
+        else:
+            self.classifier = SklearnClassifier(scikit_classifier)
+        self.compiled_templates = self.process_template(template_file_name)
         feature_sets = []
-        iob_data = file_to_instances(train_file_name)
-        print "instances ",len(iob_data)
-        print "tokens",count_tokens(iob_data)
-        for n,instance in enumerate(iob_data[:10]):
-            sentence_n = n
-            pos_tags = [('z_POS',token[1]) for token in instance]
-            labels = [token[2] for token in instance]
-            tokens = [token[0] for token in instance]
-            for n,token in enumerate(tokens):
-                dict_features = fe.get_features([token],labels=labels,outp_label=False,legacy_features=pos_tags)[0]
-                # this has to be removed when training the CRF model (!)
-                dict_features["q:id"]=sentence_n
-                feature_sets.append([dict_features, labels[n]])
-        #pprint(feature_sets)
-        print len(feature_sets)
-        self.classifier.train(feature_sets)
+        if(labelled_feature_sets is not None):
+            feature_sets = labelled_feature_sets
+            logger.info("using a pre-computed feature_sets containing %i instances"%len(feature_sets))
+        else:
+            iob_data = file_to_instances(train_file_name)
+            logger.info("instances %i"%len(iob_data))
+            logger.info("tokens %i"%count_tokens(iob_data))
+            for n,instance in enumerate(iob_data):
+                sentence_n = n
+                pos_tags = [('z_POS',token[1]) for token in instance]
+                labels = [token[2] for token in instance]
+                tokens = [token[0] for token in instance]
+                for n,token in enumerate(tokens):
+                    dict_features = fe.get_features([token],labels=labels,outp_label=False,legacy_features=pos_tags)[0]
+                    feature_sets.append([dict_features, labels[n]])
+        self.classifier.train(self.apply_feature_template(feature_sets,out_label=True))
         return
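NLTK's SklearnClassifier feeds the wrapped estimator a sparse feature matrix unless told otherwise, which is why the two estimators special-cased above (RandomForestClassifier and, in particular, GaussianNB, which requires dense arrays) are constructed with sparse=False. A minimal sketch of the wrapping this adapter relies on, with made-up features and labels:

    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from nltk.classify.scikitlearn import SklearnClassifier
    >>> wrapped = SklearnClassifier(RandomForestClassifier(), sparse=False)
    >>> _ = wrapped.train([({'a_token': u'Hom.'}, 'B-AAUTHOR'), ({'a_token': u'reads'}, 'O')])
    >>> wrapped.classify_many([{'a_token': u'Hom.'}])
    ['B-AAUTHOR']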
     def classify(self,feature_sets):
-        return self.classifier.classify_many(feature_sets)
+        """
+        Args:
+            feature_sets:
+                a list of dictionaries like the following:
 
-class NaiveBayes_Classifier:
-    """
-    Just a wrapper around a an SklearnClassifier (nltk.classify.scikitlearn) object
-    to make sure that all classifiers take same input and return the same output.
-    """
-    def __init__():
-        pass
+        [{'a_token': u'Nella',
+          'b_punct': 'OTHERS',
+          'c_brackets': 'OTHERS',
+          'd_case': 'INIT_CAPS',
+          'e_number': 'NO_DIGITS',
+          'f_ngram_1': u'N',
+          'f_ngram_2': u'Ne',
+          'f_ngram_3': u'Nel',
+          'f_ngram_4': u'Nell',
+          'g_ngram_1': u'a',
+          'g_ngram_2': u'la',
+          'g_ngram_3': u'lla',
+          'g_ngram_4': u'ella',
+          'h_lowcase': u'nella',
+          'i_str-length': '5',
+          'l_pattern': 'Aaaaa',
+          'm_compressed-pattern': 'Aa',
+          'n_works_dictionary': 'OTHERS',
+          'z': '_'}, ... ]
 
-    def classify():
-        pass
+        Returns:
+            result:
+                a list of dictionaries, one per input token, each containing
+                the token and the label assigned to it:
 
-class RandomForest_Classifier:
-    """
-    Just a wrapper around a an SklearnClassifier (nltk.classify.scikitlearn) object
-    to make sure that all classifiers take same input and return the same output.
-    """
-    def __init__():
-        pass
+        [{'token': '.',
+          'label': 'O'},...]
+        """
+        # apply the feature templates (from CRF++) to the input
+        template_feature_sets = self.apply_feature_template(feature_sets,out_label=False)
+        # collect the labels predicted by the wrapped classifier
+        output_labels = self.classifier.classify_many(template_feature_sets)
+        result = []
+        for n,feature_set in enumerate(feature_sets):
+            temp = {}
+            temp["token"]=feature_set["a_token"].encode('utf-8')
+            temp["label"]=str(output_labels[n])
+            result.append(temp)
+        return result
+    def process_template(self,template_file):
+        """
+        Read a CRF++ template file and compile each feature template
+        into a (pattern, offsets) pair.
 
-    def classify():
-        pass
+        Example of the output:
+
+        [('U01:%s', [(-2, 0)]),
+         ('U02:%s', [(-1, 0)]),...]
+        """
+        f = open(template_file,'r')
+        lines = [line.replace('\n','') for line in f.readlines() if not line.startswith('\n') and not line.startswith('#') and not line.startswith('B')]
+        f.close()
+        import re
+        exp = re.compile("%x\[(-?\d+),(-?\d+)\]")
+        result = []
+        for line in lines:
+            result.append((exp.sub('%s',line),[(int(match[0]),int(match[1])) for match in exp.findall(line)]))
+        return result
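The transformation performed on each CRF++ template line boils down to one regular expression; a sketch of the same logic applied to a single line such as U01:%x[-2,0]:

    >>> import re
    >>> exp = re.compile("%x\[(-?\d+),(-?\d+)\]")
    >>> line = "U01:%x[-2,0]"
    >>> (exp.sub('%s', line), [(int(r), int(c)) for r, c in exp.findall(line)])
    ('U01:%s', [(-2, 0)])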
+    def apply_feature_template(self,feature_sets,out_label=False):
+        """
+        Apply each of the compiled templates (see `process_template`)
+        to the given feature sets.
+        """
+        def get_value(feature_sets,token_n,feature_n):
+            if(token_n < 0):
+                return "ND"
+            elif(token_n > (len(feature_sets)-1)):
+                return "ND"
+            else:
+                return feature_sets[token_n][feature_n]
+        if(out_label):
+            unlabelled_feature_sets = [[f[0][key] for key in sorted(f[0])] for f in feature_sets]
+        else:
+            unlabelled_feature_sets = [[f[key] for key in sorted(f)] for f in feature_sets]
+        assert len(feature_sets) == len(unlabelled_feature_sets)
+        new_features = []
+        for n,fs in enumerate(unlabelled_feature_sets):
+            result = {}
+            for template,replacements in self.compiled_templates:
+                template_name = template.split(":")[0]
+                template = template.split(":")[1]
+                values = [get_value(unlabelled_feature_sets,n+r[0],r[1]) for r in replacements]
+                result[template_name] = template%tuple(values)
+            if(out_label):
+                # keep the expected label for training
+                new_features.append([result,feature_sets[n][1]])
+            else:
+                new_features.append(result)
+        return new_features
+
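To make the windowing concrete: given an already-constructed adapter instance (hypothetical here) whose compiled templates look at the previous and the current token, a toy run behaves as follows. Positions before the first or after the last token yield the "ND" padding value (dict key order may differ):

    >>> adapter.compiled_templates = [('U01:%s', [(-1, 0)]), ('U02:%s', [(0, 0)])]
    >>> tokens = [{'a_token': u'Hom.'}, {'a_token': u'Il.'}, {'a_token': u'1.1'}]
    >>> adapter.apply_feature_template(tokens)
    [{'U01': 'ND', 'U02': u'Hom.'}, {'U01': u'Hom.', 'U02': u'Il.'}, {'U01': u'Il.', 'U02': u'1.1'}]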
+def chain_IOB_files(directories,output_fname,extension=".iob"):
     import glob
     import codecs
     all_in_one = []
@@ -152,7 +250,7 @@ def chain_IOB_files(directories,output_fname):
         # concatenate their content with line return
         # write to a new file
         logger.debug("Processing %s"%dir)
-        for infile in glob.glob( os.path.join(dir, '*.iob') ):
+        for infile in glob.glob( os.path.join(dir, '*%s'%extension) ):
             logger.debug("Found the file %s"%infile)
             file_content = codecs.open("%s"%(infile), 'r',encoding="utf-8").read()
             all_in_one.append(file_content)
@@ -184,53 +282,28 @@ class citation_extractor:
     And finally we classify the test instances
 
     >>> result = extractor.extract(instances, postags)
+
+    By default, a CRF model is used. However, when initialising the `citation_extractor` you can
+    pass on to it any scikit-learn classifier, e.g. a RandomForestClassifier:
+
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> extractor = citation_extractor(base_settings,RandomForestClassifier())
+
     """
-    def __init__(self,options,classifier_type):
+    def __init__(self,options,classifier=None,labelled_feature_sets=None):
         self.classifier=None
-        logfile = ""
-        if(options.DEBUG):
-            self.init_logger(loglevel=logging.DEBUG, log_file=options.LOG_FILE)
-        else:
-            self.init_logger(loglevel=logging.INFO, log_file=options.LOG_FILE)
         self.fe = FeatureExtractor()
         if(options.DATA_FILE != ""):
             allinone_iob_file = options.DATA_FILE
         elif(options.DATA_DIRS != ""):
-            chain_IOB_files(options.DATA_DIRS,"%sall_in_one.iob"%options.TEMP_DIR)
+            chain_IOB_files(options.DATA_DIRS,"%sall_in_one.iob"%options.TEMP_DIR,".txt")
             allinone_iob_file = "%sall_in_one.iob"%options.TEMP_DIR
         # initialise the classifier
-        if(classifier_type == "crf"):
+        if(classifier is None):
             self.classifier=CRFPP_Classifier(allinone_iob_file,"%s%s"%(options.CRFPP_TEMPLATE_DIR,options.CRFPP_TEMPLATE),options.TEMP_DIR)
-        elif(classifier_type == "svm"):
-            self.classifier = SVM_Classifier(allinone_iob_file)
-        elif(classifier_type == "rf"):
-            self.classifier = RandomForest_Classifier(allinone_iob_file)
-        elif(classifier_type == "nb"):
-            self.classifier = NaiveBayes_Classifier(allinone_iob_file)
-        else:
-            pass
-
-    def init_logger(self,log_file=None, loglevel=logging.DEBUG):
-        """
-        Initialise the logger
-        """
-        if(log_file !="" or log_file is not None):
-            logging.basicConfig(
-                filename=log_file
-                ,level=loglevel,format='%(asctime)s - %(name)s - [%(levelname)s] %(message)s',filemode='w',datefmt='%a, %d %b %Y %H:%M:%S'
-            )
-            logger = logging.getLogger('CREX')
-            logger.info("Logger initialised")
         else:
-            logger = logging.getLogger('CREX')
-            logger.setLevel(loglevel)
-            ch = logging.StreamHandler()
-            ch.setLevel(loglevel)
-            formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-            ch.setFormatter(formatter)
-            logger.addHandler(ch)
-            logger.info("Logger initialised")
+            self.classifier = ScikitClassifierAdapter(classifier,allinone_iob_file,"%s%s"%(options.CRFPP_TEMPLATE_DIR,options.CRFPP_TEMPLATE),labelled_feature_sets)
 
     def output(self,result,outp=None):
         """
diff --git a/citation_extractor/eval.py b/citation_extractor/eval.py
index 6b115bd7..74868b3a 100755
--- a/citation_extractor/eval.py
+++ b/citation_extractor/eval.py
@@ -88,7 +88,10 @@ def __init__(self,extractors,iob_directories=[],iob_file=None,label_index=-1):
         self.logger = logging.getLogger("CREX.SIMPLEVAL")
         if(iob_file is None):
             self.logger.debug(iob_directories)
-            self.test_instances = self.read_instances(iob_directories)
+            data = []
+            for directory in iob_directories:
+                data += IO.read_iob_files(directory,".txt")
+            self.test_instances = data
         else:
             self.test_instances = IO.file_to_instances(iob_file)
         self.logger.debug("Found %i instances for test"%len(self.test_instances))
@@ -106,13 +109,15 @@ def eval(self):
         TODO
         """
         extractor_results = {}
-        for eng in self.extractors:
-            input = [[[token[0] for token in instance] for instance in self.test_instances]]
+        for extractor in self.extractors:
+            eng = extractor[1]
+            extractor_name = extractor[0]
+            input = [[token[0] for token in instance] for instance in self.test_instances if len(instance)>0]
             POS = False
             if(len(self.test_instances[0][0]) > 2):
                 self.label_index = 2 # the last one is the label
-                legacy_features = [[("z_POS",token[1]) for token in instance] for instance in self.test_instances]
-                output = eng.extract(input,[legacy_features])
+                legacy_features = [[("z_POS",token[1]) for token in instance] for instance in self.test_instances if len(instance)>0]
+                output = eng.extract(input,legacy_features)
                 POS = True
             else:
                 output = eng.extract(input)
@@ -128,7 +133,7 @@ def eval(self):
             eval_results["precision"] = self.calc_precision(eval_results)
             eval_results["recall"] = self.calc_recall(eval_results)
             by_tag_results = self.calc_stats_by_tag(by_tag_results)
-            extractor_results[str(eng)] = results
+            extractor_results[extractor_name] = results
         return extractor_results
 
     @staticmethod
@@ -250,27 +255,27 @@ def evaluate(l_tagged_instances,l_test_instances,negative_BIO_tag = u'O',label_i
                 if(tagged_label == gold_label):
                     p_tp += 1
                     errors_by_tag[gold_label]["true_pos"] += 1
-                    l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "TP"))
+                    l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("TP",tagged_token, tagged_label, gold_label))
                 elif(tagged_label != gold_label):
                     if(tagged_label == negative_BIO_tag):
                         p_fn += 1
                         errors_by_tag[gold_label]["false_neg"] += 1
-                        l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "FN"))
+                        l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("FN",tagged_token, tagged_label, gold_label))
                     else:
                         p_fp += 1
                         errors_by_tag[gold_label]["false_pos"] += p_fp
-                        l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "FP"))
+                        l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("FP",tagged_token, tagged_label, gold_label))
             elif(gold_label == negative_BIO_tag):
                 l_logger.debug("Label \"%s\" for token \"%s\" is negative"%(gold_label,gold_token))
                 if(tagged_label == gold_label):
                     p_tn += 1
                     errors_by_tag[gold_label]["true_pos"] += 1
-                    l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "TN"))
+                    l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("TN",tagged_token, tagged_label, gold_label))
                 elif(tagged_label != gold_label):
                     if(tagged_label != negative_BIO_tag):
                         p_fp += 1
                         errors_by_tag[gold_label]["false_pos"] += 1
-                        l_logger.debug("\"%s\"=> tagged: %s / gold: %s [%s]"%(tagged_token, tagged_label, gold_label, "FP"))
+                        l_logger.info("[%s] \"%s\"=> tagged: %s / gold: %s"%("FP",tagged_token, tagged_label, gold_label))
         fp += p_fp
         tp += p_tp
         fn += p_fn
@@ -408,8 +413,6 @@ def calc_fscore(d_errors):
             return 0
         else:
             return 2*(float(prec * rec) / float(prec + rec))
-
-
 class CrossEvaluator(SimpleEvaluator):
     """
     >>> import settings #doctest: +SKIP
@@ -442,7 +445,7 @@ def __init__(self,extractors,iob_test_file,culling_size=None,fold_number=10,eval
 
     def create_datasets(self):
         """
-        docstring for create_datasets
+        TODO
         """
         from miguno.partitioner import *
@@ -465,22 +468,11 @@ def create_datasets(self):
 
     def run(self):
         """
-        docstring for run
-
-        TODO:
-            for each iteration
-                for each engine (extractor)
-                    write to file the train set
-                    write to file the test set
-                    evaluate
-                    append to
-                    results[extractors[str(extractor_1)]][round-n][fscore]
-                    results[extractors[str(extractor_1)]][round-n][prec]
-                    results[extractors[str(extractor_1)]][round-n][...]
-
+        TODO
         """
         iterations = []
         results = {}
+        results_by_entity = {}
         # first let's create test and train set for each iteration
         for x,iter in enumerate(self.dataSets_iterator):
             self.logger.info("Iteration %i"%(x+1))
@@ -497,38 +489,47 @@ def run(self):
         # let's go through all the iterations
         for i,iter in enumerate(iterations):
             results["iter-%i"%(i+1)] = {}
-            for n,extractor_settings in enumerate(self.extractors):
-                results["iter-%i"%(i+1)]["extractor-%i"%(n+1)] = {}
-                self.logger.info("Running iteration #%i with extractor #%i"%(i+1,n+1))
-                self.logger.info(extractor_settings)
-                train_file="%sfold_%i.train"%(self.evaluation_dir,i+1)
-                test_file="%sfold_%i.test"%(self.evaluation_dir,i+1)
+            results_by_entity["iter-%i"%(i+1)] = {}
+            train_file="%sfold_%i.train"%(self.evaluation_dir,i+1)
+            test_file="%sfold_%i.test"%(self.evaluation_dir,i+1)
+            IO.write_iob_file(iter[0],train_file)
+            IO.write_iob_file(iter[1],test_file)
+            # the following lines are a bit of a workaround:
+            # to avoid recomputing the features when training
+            # each new classifier, I take them from the file created
+            # to train the CRF model (which should always be the first extractor
+            # to be evaluated).
+            filename = "%sfold_%i.train.train"%(self.extractors[0][1].TEMP_DIR,(i+1))
+            f=codecs.open(filename,'r','utf-8')
+            data = f.read()
+            f.close()
+            feature_sets=[[[token.split('\t')[:len(token.split('\t'))-1],token.split('\t')[len(token.split('\t'))-1:]] for token in instance.split('\n')] for instance in data.split('\n\n')]
+            order = FeatureExtractor().get_feature_order()
+            labelled_feature_sets=[]
+            for instance in feature_sets:
+                for token in instance:
+                    temp = [{order[n]:feature for n,feature in enumerate(token[0])},token[1][0]]
+                    labelled_feature_sets.append(temp)
+            self.logger.info("read %i labelled instances"%len(feature_sets))
+            for n,extractor in enumerate(self.extractors):
+                extractor_settings = extractor[1]
+                extractor_name = extractor[0]
+                results["iter-%i"%(i+1)][extractor_name] = {}
+                self.logger.info("Running iteration #%i with extractor %s"%(i+1,extractor_name))
                 self.logger.info(train_file)
                 self.logger.info(test_file)
-                import codecs
-                file = codecs.open(train_file,'w','utf-8')
-                if(len(iter[0][0][0])==2):
-                    tmp = [[("%s\t%s"%(token[0],token[1]))for token in instance] for instance in iter[0]]
-                else:
-                    tmp = [[("%s\t%s\t%s"%(token[0],token[1],token[2]))for token in instance] for instance in iter[0]]
-                tmp = ["\n".join(x) for x in tmp]
-                to_write = "\n\n".join(tmp)
-                file.write(to_write)
-                file.close()
-                file = codecs.open(test_file,'w','utf-8')
-                if(len(iter[0][0][0])==2):
-                    tmp = [[("%s\t%s"%(token[0],token[1]))for token in instance] for instance in iter[1]]
-                else:
-                    tmp = [[("%s\t%s\t%s"%(token[0],token[1],token[2]))for token in instance] for instance in iter[1]]
-                tmp = ["\n".join(x) for x in tmp]
-                to_write = "\n\n".join(tmp)
-                file.write(to_write)
-                file.close()
+                self.logger.info(extractor_settings)
                 extractor_settings.DATA_FILE = train_file
-                extractor = citation_extractor(extractor_settings)
-                se = SimpleEvaluator([extractor,],iob_file=test_file)
-                results["iter-%i"%(i+1)]["extractor-%i"%(n+1)] = se.eval()[str(extractor)]
-        return results
+                if(extractor_settings.CLASSIFIER is not None):
+                    extractor = citation_extractor(extractor_settings, extractor_settings.CLASSIFIER,labelled_feature_sets)
+                else:
+                    extractor = citation_extractor(extractor_settings)
+                self.logger.info(extractor.classifier)
+                se = SimpleEvaluator([(extractor_name, extractor),],iob_file=test_file)
+                results["iter-%i"%(i+1)][extractor_name] = se.eval()[extractor_name][0]
+                results_by_entity["iter-%i"%(i+1)][extractor_name] = SimpleEvaluator.calc_stats_by_entity(se.eval()[extractor_name][1])
+                #self.logger.info(results_by_entity["iter-%i"%(i+1)][extractor_name])
+        return results,results_by_entity
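For orientation, run() now returns two dictionaries keyed first by iteration and then by extractor name: the first holds the aggregate figures computed by SimpleEvaluator (including the "precision" and "recall" entries set in eval() above), the second the per-entity breakdown from calc_stats_by_entity. A hypothetical inspection, with evaluator standing for a constructed CrossEvaluator:

    >>> results, results_by_entity = evaluator.run()
    >>> for iteration in sorted(results):
    ...     for name in results[iteration]:
    ...         print iteration, name, results[iteration][name]["precision"], results[iteration][name]["recall"]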
 
 if __name__ == "__main__":
     #Usage example: python eval.py aph_data_100_positive/ out/
diff --git a/citation_extractor/process.py b/citation_extractor/process.py
index 796b2942..368b1f1c 100644
--- a/citation_extractor/process.py
+++ b/citation_extractor/process.py
@@ -1,4 +1,58 @@
+# -*- coding: utf-8 -*-
+# author: Matteo Romanello, matteo.romanello@gmail.com
+
 import sys
+import logging
+
+global logger
+logger = logging.getLogger()
+
+def recover_segmentation_errors(text,abbreviation_list,verbose=False):
+    """
+    Pretty straightforward heuristic here:
+    if a line of text contains a single token which matches against a list of abbreviations,
+    assume that after this token there shouldn't be a sentence break; the same applies to
+    the last token of a line consisting of more than one token.
+
+    >> import codecs
+    >> abbrev_file = "data/abbreviations_all_in_one.txt"
+    >> abbrev = codecs.open(abbrev_file).read().split('\n')
+    >> text_file = 'data/txt/ocr_10.2307_40231021.txt'
+    >> text = codecs.open(text_file,'r','utf-8').read()
+    >> recover_segmentation_errors(text,abbrev,verbose=True)
+    """
+    def is_abbreviation(token,abbreviations):
+        return token in abbreviations
+    output = []
+    text_lines = text.split('\n')
+    if(verbose):
+        print >> sys.stderr, "Input text has %i lines"%len(text_lines)
+    for line in text_lines:
+        tokens=line.split()
+        if(len(tokens)==1):
+            output+=tokens
+            if(not is_abbreviation(tokens[0],abbreviation_list)):
+                output.append('\n')
+            else:
+                if(verbose):
+                    print >> sys.stderr,"%s is an abbreviation"%tokens[0]
+        else:
+            output+=tokens
+            try:
+                last_token = tokens[len(tokens)-1]
+                if(not is_abbreviation(last_token,abbreviation_list)):
+                    output.append('\n')
+                else:
+                    if(verbose):
+                        print >> sys.stderr,"%s is an abbreviation"%last_token
+            except Exception, e:
+                pass
+    output_text = " ".join(output)
+    if(verbose):
+        print >> sys.stderr, "Output text has %i lines"%len(output_text.split('\n'))
+        print >> sys.stderr, "%i line breaks were recovered"%(len(text_lines)-len(output_text.split('\n')))
+    return output_text
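A toy illustration of the intended behaviour, with a two-entry abbreviation list: the u'Hom.' ending the first line suppresses the sentence break, so the two lines are merged (output is indicative, the function re-joins all tokens with single spaces):

    >>> abbreviations = [u'Hom.', u'Il.']
    >>> merged = recover_segmentation_errors(u'as noted by Hom.\nIl. 1,1 already', abbreviations)
    >>> print merged
    as noted by Hom. Il. 1,1 already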
 
 def get_taggers(treetagger_dir = '/Applications/treetagger/cmd/',abbrev_file=None):
     """docstring for create_taggers"""
@@ -6,7 +60,7 @@ def get_taggers(treetagger_dir = '/Applications/treetagger/cmd/',abbrev_file=Non
     import os
     os.environ["TREETAGGER"]=treetagger_dir
     lang_codes = {
-        'en':('english','latin-1'),
+        'en':('english','utf8'),
         'it':('italian','utf8'),
         'es':('spanish','utf8'),
         'de':('german','utf8'),
@@ -20,6 +74,27 @@
             raise e
     return taggers
 
+def get_extractor(settings):
+    """
+    Instantiate, train and return a citation_extractor.
+    """
+    import sys
+    import citation_extractor as citation_extractor_module
+    from citation_extractor.core import citation_extractor
+    from citation_extractor.eval import IO
+    ce = None
+    try:
+        logger.info("Using CitationExtractor v. %s"%citation_extractor_module.__version__)
+        train_instances = []
+        for directory in settings.DATA_DIRS:
+            train_instances += IO.read_iob_files(directory,extension=".txt")
+        logger.info("Training data: found %i directories containing %i sentences and %i tokens"%(len(settings.DATA_DIRS),len(train_instances),IO.count_tokens(train_instances)))
+        ce = citation_extractor(settings)
+    except Exception, e:
+        print e
+    finally:
+        return ce
+
 def detect_language(text):
     """
     Detect language of a notice by using the module guess_language.
@@ -51,6 +126,36 @@ def create_instance_tokenizer(train_dirs=[("/Users/56k/phd/code/APh/corpus/txt/"
         train_text += [codecs.open(file,'r','utf-8').read() for file in glob.glob( os.path.join(dir[0], '*%s'%dir[1]))]
     return PunktSentenceTokenizer(sep.join(train_text))
 
+def compact_abbreviations(abbreviation_dir):
+    """
+    process several files with abbreviations,
+    chain them together and write them to a file
+    """
+    fname = "%s%s"%(abbreviation_dir,"kb_abbrevs.txt")
+    import codecs
+    f = codecs.open(fname,'w','utf-8')
+    abbrvs = get_abbreviations_from_knowledge_base()
+    f.write("\n".join(abbrvs))
+    f.close()
+    abbreviations = []
+    files = [
+        fname
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/ita/abbreviations.txt"
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/eng/abbreviations.txt"
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/ita/no_split_abbreviations.txt"
+        ,"/Applications/TextPro1.5.2/SentencePro/bin/dict/eng/no_split_abbreviations.txt"
+    ]
+    for fn in files:
+        f = codecs.open(fn,'r','utf-8')
+        print >> sys.stderr, "getting abbreviations from %s"%fn
+        abbreviations = abbreviations + [line for line in f.readlines() if not line.startswith("#") and line !=""]
+    abbreviations = sorted(list(set(abbreviations)))
+    fname = "%s%s"%(abbreviation_dir,"abbreviations_all_in_one.txt")
+    f = codecs.open(fname,'w','utf-8')
+    f.write("".join(abbreviations))
+    f.close()
+    return fname,abbreviations
+
 def split_sentences(filename,outfilename=None):
     """
     sentence tokenization
@@ -392,8 +497,10 @@ def tokenize(sentences,taggers, outfilename=None):
         tok_lang = lang
         if(tok_lang in ["en*","en**"]):
             tok_lang = "en"
-        tmp = [result[:2] for result in taggers[tok_lang].tag(sent)]
-        #print >> sys.stderr,"Tokenized sentence %i / %i"%(n,len(sentences))
+        try:
+            tmp = [result[:2] for result in taggers[tok_lang].tag(sent)]
+        except Exception, e:
+            print >> sys.stderr,e
+            tmp = [] # avoid appending the previous sentence's tokens again
         iob.append(tmp)
     return lang,iob
@@ -415,7 +522,7 @@ def preprocess(filename,taggers, outputdir, outfilename=None,split_sentence=Fals
     if(split_sentence):
         sentences = split_sentences(filename)
     else:
-        sentences = [text.replace("\n"," ")]
+        sentences = text.split('\n')
     print >> sys.stderr, "Text was split into %i sentences"%len(sentences)
     # tokenize
     lang, iob = tokenize(sentences,taggers)
@@ -440,7 +547,6 @@ def save_scope_annotations(fileid, ann_dir, annotations):
     t[1] is the label (it doesn't get written to the file)
     t[2] is the URN, i.e. the content of the annotation
     if t[2] is None the annotation is skipped
-
     """
     ann_file = "%s%s-doc-1.ann"%(ann_dir,fileid)
     file_content = open(ann_file,'r').read()
@@ -468,60 +574,102 @@ def tostandoff(iobfile,standoffdir,brat_script):
     except Exception, e:
         raise e
 
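Before matching, the two disambiguation helpers below normalise their input strings with small regular expressions that strip guillemets, parentheses and commas. The effect of the citation-string cleaning pattern in isolation, on an illustrative input:

    >>> import re
    >>> regex_clean_citstring = r'(« )|( »)|\(|\)|\,'
    >>> re.sub(regex_clean_citstring, "", u'(Hom. Il.),')
    u'Hom. Il.'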
-def disambiguate_relations(citation_matcher, relations,entities,docid):
+def disambiguate_relations(citation_matcher,relations,entities,docid,fuzzy=False,distance_threshold=3,fill_nomatch_with_bogus_urn=False):
     """
-
-    TODO
-
     Returns:
-        (u'R5', u'[ Verg. ] catal. 47s', u'urn:cts:TODO:47s')
+        [(u'R5', u'[ Verg. ] catal. 47s', u'urn:cts:TODO:47s')]
     """
     import re
-    print >> sys.stderr, "Disambiguating the %i relation contained in %s..."%(len(relations), docid)
     result = []
     for relation in relations:
         relation_type = relations[relation][0]
         arg1 = relations[relation][1].split(":")[1]
         arg2 = relations[relation][2].split(":")[1]
-        refauwo=entities[arg1][1]
-        refauwo=re.sub("[\(, \)]","",refauwo) # TODO move this to CitationParser
+        citation_string=entities[arg1][1]
         scope = entities[arg2][1]
-        scope = re.sub("\.$","",scope)
-        scope = re.sub("\,$","",scope)
-        scope = re.sub("[\(, \)]","",scope)
+        regex_clean_citstring = r'(« )|( »)|\(|\)|\,'
+        regex_clean_scope = r'(\(|\)| ?\;$|\.$|\,$)'
+        citation_string_cleaned = re.sub(regex_clean_citstring,"",citation_string)
+        scope_cleaned = re.sub(regex_clean_scope,"",scope)
+        print >> sys.stderr, "Citation_string cleaning: from \'%s\' to \'%s\'"%(citation_string,citation_string_cleaned)
+        print >> sys.stderr, "Scope cleaning: from \'%s\' to \'%s\'"%(scope,scope_cleaned)
+        citation_string = citation_string_cleaned
+        scope = scope_cleaned
         try:
-            urn = citation_matcher.disambiguate(refauwo,scope)[0]
-            result.append((relation,"%s %s"%(refauwo,scope),urn))
+            urn = citation_matcher.disambiguate(citation_string,scope,fuzzy=fuzzy,distance_threshold=distance_threshold,cleanup=True)[0]
+            result.append((relation,"%s %s"%(citation_string,scope),urn))
         except Exception, e:
             normalized_scope = scope
             try:
                 normalized_scope = citation_matcher._citation_parser.parse(scope)
                 normalized_scope = citation_matcher._format_scope(normalized_scope[0]['scp'])
             except Exception, e:
-                print >> sys.stderr, e
-            result.append((relation,"%s %s"%(refauwo,scope),None))
+                print e
+            if(fill_nomatch_with_bogus_urn):
+                result.append((relation,"%s %s"%(citation_string,scope),"urn:cts:TODO:%s"%normalized_scope))
     return result
-
 def disambiguate_entities(citation_matcher,entities,docid,min_distance_threshold,max_distance_threshold):
+    """
+    When no match is found it's better not to fill in a bogus URN. The
+    reason is that in some cases it's perfectly ok that no match is found: an entity
+    can be a valid entity even without disambiguation information in the ground truth.
+    """
+    def longestSubstringFinder(string1, string2):
+        """
+        solution taken from http://stackoverflow.com/questions/18715688/find-common-substring-between-two-strings
+        """
+        answer = ""
+        len1, len2 = len(string1), len(string2)
+        for i in range(len1):
+            match = ""
+            for j in range(len2):
+                if (i + j < len1 and string1[i + j] == string2[j]):
+                    match += string2[j]
+                else:
+                    if (len(match) > len(answer)): answer = match
+                    match = ""
+        return answer
+    import re
+    from operator import itemgetter
     print >> sys.stderr, "Disambiguating the %i entities contained in %s..."%(len(entities), docid)
     result = []
+    matches = []
     distance_threshold = min_distance_threshold
+    regex_clean_string = r'(« )|( »)|\(|\)|\,'
     for entity in entities:
         entity_type = entities[entity][0]
+        string = entities[entity][1].encode("utf-8")
+        cleaned_string = re.sub(regex_clean_string,"",string)
+        print >> sys.stderr, "String cleaning: from \'%s\' to \'%s\'"%(string,cleaned_string)
+        string = cleaned_string
         if entity_type == "AAUTHOR":
-            string = entities[entity][1]
             matches = citation_matcher.matches_author(string,True,distance_threshold)
             while(matches is None and distance_threshold <= max_distance_threshold):
                 distance_threshold+=1
                 matches = citation_matcher.matches_author(string,True,distance_threshold)
-            if(matches is not None):
-                result.append((entity, string ,matches[0][0]))
         elif(entity_type == "AWORK"):
-            string = entities[entity][1]
             matches = citation_matcher.matches_work(string,True,distance_threshold)
             while(matches is None and distance_threshold <= max_distance_threshold):
                 distance_threshold+=1
                 matches = citation_matcher.matches_work(string,True,distance_threshold)
-            if(matches is not None):
-                result.append((entity, string ,matches[0][0]))
+        if(matches is not None and (entity_type == "AAUTHOR" or entity_type == "AWORK")):
+            lowest_score = 1000
+            for match in matches:
+                score = match[2]
+                if(score < lowest_score):
+                    lowest_score = score
+            filtered_matches = [match for match in matches if match[2]==lowest_score]
+            filtered_matches = sorted(filtered_matches, key=itemgetter(2))
+            best_match = ("",None)
+            if(lowest_score > 0):
+                for match in filtered_matches:
+                    lcs = longestSubstringFinder(match[1],string)
+                    if(len(lcs)>len(best_match[0])):
+                        best_match = (lcs,match)
+            if(best_match[1] is not None):
+                result.append((entity,string,best_match[1][0]))
+            else:
+                result.append((entity,string,filtered_matches[0][0]))
     return result
\ No newline at end of file
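The nested longestSubstringFinder above is what breaks ties between equally scored fuzzy matches: the candidate sharing the longest common substring with the entity string wins. Lifted to module scope for the sake of a quick check:

    >>> longestSubstringFinder("Homerus", "Homeros")
    'Homer'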
diff --git a/citation_extractor/settings/__init__.pyc b/citation_extractor/settings/__init__.pyc
index 84447dc35907440a5f5368f6f917a0d816fde85e..e7e81df166c435b6e71bc3edc854aadd684b999a 100644
GIT binary patch