
Commit 9ccf349: add some scripts and csvs
Parent: 82c8d16

10 files changed: +2063, -416 lines

.gitignore (+11)
@@ -1,3 +1,14 @@
 *.swp
 .idea/
 *.pyc
+algorithm.log
+cleardb.py
+dbdata/
+lakas.py
+lakis.py
+mock.py
+scrader.log
+scraper_logs.log
+tsak.py
+
+
Scraderlatestnews.csv (+281, -123)

Large diffs are not rendered by default.

ScraderwithSentiment.csv (+281, -45)

Large diffs are not rendered by default.

checked_articles.py (+5, -5)
@@ -5,9 +5,9 @@
 dbcli = MongoClient()
 db = dbcli['scrader']
 collection = db['dev_articles']
-cursor = list(collection.find({'checked': True}))
-dataframe = pd.Dataframe(cursor)
-if not os.path.isfile('news.csv'):
-    dataframe.to_csv('news.csv', encoding='utf-8')
+cursor = list(collection.find({}))
+dataframe = pd.DataFrame(cursor)
+if not os.path.isfile('dev_news.csv'):
+    dataframe.to_csv('dev_news.csv', encoding='utf-8')
 else:  # else it exists so append without writing the header
-    dataframe.to_csv('news.csv', mode='a', header=False, encoding='utf-8')
+    dataframe.to_csv('dev_news.csv', mode='a', header=False, encoding='utf-8')
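The hunk above starts at line 5, so the import block is not shown. For orientation, the whole script after this change presumably reads roughly as follows; the first few lines are inferred, not part of the diff:

# assumed reconstruction of checked_articles.py after this commit;
# the imports fall outside the hunk and are inferred from the calls below
import os
import pandas as pd
from pymongo import MongoClient

dbcli = MongoClient()
db = dbcli['scrader']
collection = db['dev_articles']
cursor = list(collection.find({}))           # no longer filters on 'checked': True
dataframe = pd.DataFrame(cursor)
if not os.path.isfile('dev_news.csv'):
    # first run: write the header row
    dataframe.to_csv('dev_news.csv', encoding='utf-8')
else:  # else it exists so append without writing the header
    dataframe.to_csv('dev_news.csv', mode='a', header=False, encoding='utf-8')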

init_url_terms.py (+67)
@@ -0,0 +1,67 @@
+import csv
+from pymongo import MongoClient
+import copy
+dbcli = MongoClient()
+db = dbcli['scrader']
+
+collection = db['url_terms']
+with open('LIST_OF_URLS.csv') as csvfile:
+    reader = csv.DictReader(csvfile)
+    url_terms = []
+    for url_term in reader:
+        term = url_term.get('URL TERMS')
+        url_terms.append(term)
+    url_dict = {'url_terms': url_terms}
+    collection.insert_one(url_dict)
+#url_list = list(collection.find({}, {'_id': False}))
+#urls = url_list[0].get('url_terms')
+#print(len(urls))
+#print urls[0]
+#print urls[-1]
+collection = db['scraper_companies']
+companies = []
+company_dict = {}
+synonims_keys = ['SYNOMYM 1', 'SYNOMYM 2', 'SYNOMYM 3', 'SYNOMYM 4']
+with open('COMPANY_NAMES.csv') as csvfile:
+    reader = csv.DictReader(csvfile)
+    for company in reader:
+        if company.get('COMPANY NAMES') != '':
+            #print company.get('COMPANY NAMES')
+            term = company.get('COMPANY NAMES')
+            new_comp_dict = copy.deepcopy(company_dict)
+            new_comp_dict['company_name'] = term
+            new_comp_dict['synonims'] = []
+            new_comp_dict['url_terms'] = []
+            new_comp_dict['url_terms'].append(company.get('URL TERMS'))
+            for syn in synonims_keys:
+                #print syn
+                #print(company.get(syn))
+                if company.get(syn) != "":
+                    if company.get(syn) is not None:
+                        #print company.get(syn)
+                        new_comp_dict['synonims'].append(company.get(syn))
+            new_comp_dict['synonims'].append(term)
+            companies.append(new_comp_dict)
+        else:
+            #print(company.get('URL TERMS'))
+            new_comp_dict['url_terms'].append(company.get('URL TERMS'))
+
+collection.insert_many(companies)
+print(len(companies))
+print companies[76]
+print companies[32]
+print companies[900]
+print companies[1876]
+print companies[632]
+print companies[1899]
+
+#scraper_companies = list(collection.find({}, {'_id': False}))
+#print len(scraper_companies)
+#print scraper_companies[0]
+#print scraper_companies[-1]
+
+#for comp in companies:
+#    if comp.get('company_name') == 'Citigroup':
+#        print comp.get('synonims')
+#    if comp.get('company_name') == 'Royal Dutch Shell':
+#        print comp.get('synonims')
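Given the column names the script reads ('COMPANY NAMES', 'URL TERMS', 'SYNOMYM 1' through 'SYNOMYM 4'), each document inserted into scraper_companies should come out shaped roughly like the sketch below; the field values are illustrative, not taken from the actual CSV:

# illustrative shape of one scraper_companies document; values are invented
{
    'company_name': 'Citigroup',
    'synonims': ['Citi', 'Citigroup'],     # non-empty SYNOMYM cells, plus the name itself appended last
    'url_terms': ['citigroup-news'],       # extended by follow-on rows whose COMPANY NAMES cell is empty
}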

mongod.conf (+75)
@@ -0,0 +1,75 @@
+# mongod.conf
+
+# Where to store the data.
+
+# Note: if you run mongodb as a non-root user (recommended) you may
+# need to create and set permissions for this directory manually,
+# e.g., if the parent directory isn't mutable by the mongodb user.
+dbpath=/var/lib/mongodb
+
+#where to log
+logpath=/var/log/mongodb/mongod.log
+
+logappend=true
+
+port = 27017
+
+# Listen to local interface only. Comment out to listen on all interfaces.
+bind_ip = 127.0.0.1
+
+# Disables write-ahead journaling
+# nojournal = true
+
+# Enables periodic logging of CPU utilization and I/O wait
+#cpu = true
+
+# Turn on/off security. Off is currently the default
+#noauth = true
+#auth = true
+
+# Verbose logging output.
+#verbose = true
+
+# Inspect all client data for validity on receipt (useful for
+# developing drivers)
+#objcheck = true
+
+# Enable db quota management
+#quota = true
+
+# Set oplogging level where n is
+#   0=off (default)
+#   1=W
+#   2=R
+#   3=both
+#   7=W+some reads
+#diaglog = 0
+
+# Ignore query hints
+#nohints = true
+
+# Enable the HTTP interface (Defaults to port 28017).
+#httpinterface = true
+
+# Turns off server-side scripting. This will result in greatly limited
+# functionality
+#noscripting = true
+
+# Turns off table scans. Any query that would do a table scan fails.
+#notablescan = true
+
+# Disable data file preallocation.
+#noprealloc = true
+
+# Specify .ns file size for new databases.
+# nssize = <size>
+
+# Replication Options
+
+# in replicated mongo databases, specify the replica set name here
+#replSet=setname
+# maximum size in megabytes for replication operation log
+#oplogSize=1024
+# path to a key file storing authentication info for connections
+# between replica set members
+#keyFile=/path/to/keyfile
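With bind_ip set to 127.0.0.1 and the default port 27017, the bare MongoClient() calls in the scripts above resolve to this local instance. A minimal connectivity check, assuming pymongo is installed and mongod is running with this config, might look like:

# quick connectivity check against the mongod configured above
from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)  # matches bind_ip / port in mongod.conf
print(client.server_info()['version'])    # raises ServerSelectionTimeoutError if the server is unreachable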

refill.py (+64)
@@ -0,0 +1,64 @@
+import os
+import copy
+from pymongo import MongoClient
+import pandas as pd
+from bson.objectid import ObjectId
+# converts mongodb collection to csv
+
+
+def convert_collection_to_df(mongo_cli, collection, field1, match1,
+                             field2, match2):
+    dbcli = mongo_cli
+    scrader_db = dbcli['scrader']
+    cursor = scrader_db[collection].find({'$and': [{field1: {'$in': match1}},
+                                                   {field2: {'$in': match2}}]})
+    cursor_list = list(cursor)
+    copied_list = copy.deepcopy(cursor_list)
+
+    for d in cursor_list:
+        title = d['title']
+        times = 0
+        for dic in copied_list:
+            if dic['title'] == title:
+                times += 1
+                if times == 2:
+                    print(title)
+                    copied_list.remove(dic)
+                    break
+
+    print(len(copied_list))
+    somelist = []
+    somedict = {'direction': '', 'title': ''}
+    for article in copied_list:
+        newart = copy.deepcopy(somedict)
+        #scrader_db[collection].\
+        #    update({"_id": ObjectId(article['_id'])},
+        #           {'$set': {'appended': True}})
+        #article.pop('_id', None)
+        newart['direction'] = article.get('direction')
+        newart['title'] = article.get('title')
+        somelist.append(newart)
+
+    # scrader_db[collection].update({'$and': [{field1: {'$in': match1}},
+    #                                         {field2: {'$in': match2}}]},
+    #                               {'$set': {'appended': True}},
+    #                               True, True)
+    return pd.DataFrame(somelist)
+
+
+def main():
+    dataframe = convert_collection_to_df(MongoClient(), 'dev_articles',
+                                         'checked', [True],
+                                         'User', ['Ptrs'])
+    # if file does not exist write header
+    if not os.path.isfile('newdata.csv'):
+        #pass
+        dataframe.to_csv('newdata.csv', encoding='utf-8', index=False)
+    else:  # else it exists so append without writing the header
+        #pass
+        dataframe.to_csv('newdata.csv', mode='a', header=False, index=False,
+                         encoding='utf-8')
+
+
+if __name__ == '__main__':
+    main()
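main() hard-codes the filter (checked=True, User='Ptrs') and the newdata.csv output path, but since the entry point is guarded, the helper can also be imported and pointed at other values. A hedged reuse sketch follows; 'SomeUser' and the output file name are made up for illustration:

# illustrative reuse of convert_collection_to_df; 'SomeUser' is a made-up filter value
from pymongo import MongoClient
from refill import convert_collection_to_df

df = convert_collection_to_df(MongoClient(), 'dev_articles',
                              'checked', [True],
                              'User', ['SomeUser'])
df.to_csv('someuser_news.csv', encoding='utf-8', index=False)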
