
Commit 9ccf349: add some scripts and csvs
Parent: 82c8d16

10 files changed: +2063, -416 lines

.gitignore (+11)
@@ -1,3 +1,14 @@
 *.swp
 .idea/
 *.pyc
+algorithm.log
+cleardb.py
+dbdata/
+lakas.py
+lakis.py
+mock.py
+scrader.log
+scraper_logs.log
+tsak.py
+
+
Scraderlatestnews.csv (+281, -123)

Large diffs are not rendered by default.

ScraderwithSentiment.csv (+281, -45)

Large diffs are not rendered by default.

checked_articles.py (+5, -5)
@@ -5,9 +5,9 @@
 dbcli = MongoClient()
 db = dbcli['scrader']
 collection = db['dev_articles']
-cursor = list(collection.find({'checked': True}))
-dataframe = pd.Dataframe(cursor)
-if not os.path.isfile('news.csv'):
-    dataframe.to_csv('news.csv', encoding='utf-8')
+cursor = list(collection.find({}))
+dataframe = pd.DataFrame(cursor)
+if not os.path.isfile('dev_news.csv'):
+    dataframe.to_csv('dev_news.csv', encoding='utf-8')
 else:  # else it exists so append without writing the header
-    dataframe.to_csv('news.csv', mode='a', header=False, encoding='utf-8')
+    dataframe.to_csv('dev_news.csv', mode='a', header=False, encoding='utf-8')
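The hunk above starts at line 5, so the import block is not shown. For orientation, the whole script after this change presumably reads roughly as follows; the first few lines are inferred, not part of the diff:

# assumed reconstruction of checked_articles.py after this commit;
# the imports fall outside the hunk and are inferred from the calls below
import os
import pandas as pd
from pymongo import MongoClient

dbcli = MongoClient()
db = dbcli['scrader']
collection = db['dev_articles']
cursor = list(collection.find({}))           # no longer filters on 'checked': True
dataframe = pd.DataFrame(cursor)
if not os.path.isfile('dev_news.csv'):
    # first run: write the header row
    dataframe.to_csv('dev_news.csv', encoding='utf-8')
else:  # else it exists so append without writing the header
    dataframe.to_csv('dev_news.csv', mode='a', header=False, encoding='utf-8')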

init_url_terms.py (+67)
@@ -0,0 +1,67 @@
+import csv
+from pymongo import MongoClient
+import copy
+dbcli = MongoClient()
+db = dbcli['scrader']
+
+collection = db['url_terms']
+with open('LIST_OF_URLS.csv') as csvfile:
+    reader = csv.DictReader(csvfile)
+    url_terms = []
+    for url_term in reader:
+        term = url_term.get('URL TERMS')
+        url_terms.append(term)
+    url_dict = {'url_terms': url_terms}
+    collection.insert_one(url_dict)
+#url_list = list(collection.find({}, {'_id': False}))
+#urls = url_list[0].get('url_terms')
+#print(len(urls))
+#print urls[0]
+#print urls[-1]
+collection = db['scraper_companies']
+companies = []
+company_dict = {}
+synonims_keys = ['SYNOMYM 1', 'SYNOMYM 2', 'SYNOMYM 3', 'SYNOMYM 4']
+with open('COMPANY_NAMES.csv') as csvfile:
+    reader = csv.DictReader(csvfile)
+    for company in reader:
+        if company.get('COMPANY NAMES') != '':
+            #print company.get('COMPANY NAMES')
+            term = company.get('COMPANY NAMES')
+            new_comp_dict = copy.deepcopy(company_dict)
+            new_comp_dict['company_name'] = term
+            new_comp_dict['synonims'] = []
+            new_comp_dict['url_terms'] = []
+            new_comp_dict['url_terms'].append(company.get('URL TERMS'))
+            for syn in synonims_keys:
+                #print syn
+                #print(company.get(syn))
+                if company.get(syn) != "":
+                    if company.get(syn) is not None:
+                        #print company.get(syn)
+                        new_comp_dict['synonims'].append(company.get(syn))
+            new_comp_dict['synonims'].append(term)
+            companies.append(new_comp_dict)
+        else:
+            #print(company.get('URL TERMS'))
+            new_comp_dict['url_terms'].append(company.get('URL TERMS'))
+
+collection.insert_many(companies)
+print(len(companies))
+print companies[76]
+print companies[32]
+print companies[900]
+print companies[1876]
+print companies[632]
+print companies[1899]
+
+#scraper_companies = list(collection.find({}, {'_id': False}))
+#print len(scraper_companies)
+#print scraper_companies[0]
+#print scraper_companies[-1]
+
+#for comp in companies:
+#    if comp.get('company_name') == 'Citigroup':
+#        print comp.get('synonims')
+#    if comp.get('company_name') == 'Royal Dutch Shell':
+#        print comp.get('synonims')
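Given the column names the script reads ('COMPANY NAMES', 'URL TERMS', 'SYNOMYM 1' through 'SYNOMYM 4'), each document inserted into scraper_companies should come out shaped roughly like the sketch below; the field values are illustrative, not taken from the actual CSV:

# illustrative shape of one scraper_companies document; values are invented
{
    'company_name': 'Citigroup',
    'synonims': ['Citi', 'Citigroup'],     # non-empty SYNOMYM cells, plus the name itself appended last
    'url_terms': ['citigroup-news'],       # extended by follow-on rows whose COMPANY NAMES cell is empty
}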

mongod.conf (+75)
@@ -0,0 +1,75 @@
+# mongod.conf
+
+# Where to store the data.
+
+# Note: if you run mongodb as a non-root user (recommended) you may
+# need to create and set permissions for this directory manually,
+# e.g., if the parent directory isn't mutable by the mongodb user.
+dbpath=/var/lib/mongodb
+
+#where to log
+logpath=/var/log/mongodb/mongod.log
+
+logappend=true
+
+port = 27017
+
+# Listen to local interface only. Comment out to listen on all interfaces.
+bind_ip = 127.0.0.1
+
+# Disables write-ahead journaling
+# nojournal = true
+
+# Enables periodic logging of CPU utilization and I/O wait
+#cpu = true
+
+# Turn on/off security. Off is currently the default
+#noauth = true
+#auth = true
+
+# Verbose logging output.
+#verbose = true
+
+# Inspect all client data for validity on receipt (useful for
+# developing drivers)
+#objcheck = true
+
+# Enable db quota management
+#quota = true
+
+# Set oplogging level where n is
+#   0=off (default)
+#   1=W
+#   2=R
+#   3=both
+#   7=W+some reads
+#diaglog = 0
+
+# Ignore query hints
+#nohints = true
+
+# Enable the HTTP interface (Defaults to port 28017).
+#httpinterface = true
+
+# Turns off server-side scripting. This will result in greatly limited
+# functionality
+#noscripting = true
+
+# Turns off table scans. Any query that would do a table scan fails.
+#notablescan = true
+
+# Disable data file preallocation.
+#noprealloc = true
+
+# Specify .ns file size for new databases.
+# nssize = <size>
+
+# Replication Options
+
+# in replicated mongo databases, specify the replica set name here
+#replSet=setname
+# maximum size in megabytes for replication operation log
+#oplogSize=1024
+# path to a key file storing authentication info for connections
+# between replica set members
+#keyFile=/path/to/keyfile
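With bind_ip set to 127.0.0.1 and the default port 27017, the bare MongoClient() calls in the scripts above resolve to this local instance. A minimal connectivity check, assuming pymongo is installed and mongod is running with this config, might look like:

# quick connectivity check against the mongod configured above
from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)  # matches bind_ip / port in mongod.conf
print(client.server_info()['version'])    # raises ServerSelectionTimeoutError if the server is unreachable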

refill.py (+64)
@@ -0,0 +1,64 @@
+import os
+import copy
+from pymongo import MongoClient
+import pandas as pd
+from bson.objectid import ObjectId
+# converts mongodb collection to csv
+
+
+def convert_collection_to_df(mongo_cli, collection, field1, match1,
+                             field2, match2):
+    dbcli = mongo_cli
+    scrader_db = dbcli['scrader']
+    cursor = scrader_db[collection].find({'$and': [{field1: {'$in': match1}},
+                                                   {field2: {'$in': match2}}]})
+    cursor_list = list(cursor)
+    copied_list = copy.deepcopy(cursor_list)
+
+    for d in cursor_list:
+        title = d['title']
+        times = 0
+        for dic in copied_list:
+            if dic['title'] == title:
+                times += 1
+                if times == 2:
+                    print(title)
+                    copied_list.remove(dic)
+                    break
+
+    print(len(copied_list))
+    somelist = []
+    somedict = {'direction': '', 'title': ''}
+    for article in copied_list:
+        newart = copy.deepcopy(somedict)
+        #scrader_db[collection].\
+        #    update({"_id": ObjectId(article['_id'])},
+        #           {'$set': {'appended': True}})
+        #article.pop('_id', None)
+        newart['direction'] = article.get('direction')
+        newart['title'] = article.get('title')
+        somelist.append(newart)
+
+    # scrader_db[collection].update({'$and': [{field1: {'$in': match1}},
+    #                                         {field2: {'$in': match2}}]},
+    #                               {'$set': {'appended': True}},
+    #                               True, True)
+    return pd.DataFrame(somelist)
+
+
+def main():
+    dataframe = convert_collection_to_df(MongoClient(), 'dev_articles',
+                                         'checked', [True],
+                                         'User', ['Ptrs'])
+    # if file does not exist write header
+    if not os.path.isfile('newdata.csv'):
+        #pass
+        dataframe.to_csv('newdata.csv', encoding='utf-8', index=False)
+    else:  # else it exists so append without writing the header
+        #pass
+        dataframe.to_csv('newdata.csv', mode='a', header=False, index=False,
+                         encoding='utf-8')
+
+
+if __name__ == '__main__':
+    main()
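main() hard-codes the filter (checked=True, User='Ptrs') and the newdata.csv output path, but since the entry point is guarded, the helper can also be imported and pointed at other values. A hedged reuse sketch follows; 'SomeUser' and the output file name are made up for illustration:

# illustrative reuse of convert_collection_to_df; 'SomeUser' is a made-up filter value
from pymongo import MongoClient
from refill import convert_collection_to_df

df = convert_collection_to_df(MongoClient(), 'dev_articles',
                              'checked', [True],
                              'User', ['SomeUser'])
df.to_csv('someuser_news.csv', encoding='utf-8', index=False)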
