You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
157 lines
3.6 KiB
Python
157 lines
3.6 KiB
Python
import uuid, os, json, shutil, re
|
|
from collections import defaultdict, deque
|
|
|
|
class TaskManager(object):
    """Keep task records in memory and mirror each one to disk as JSON.

    Every task gets its own directory ``<base_dir>/<task_id>/`` containing a
    single JSON file. Reads are always served from the in-memory dict; this
    class never loads anything back from disk.
    """

    # Name of the JSON file written inside each task directory.
    TASK_FILENAME = "task.json"

    def __init__(self, base_dir="task_data"):
        """Create an empty manager.

        base_dir -- directory under which the per-task directories are
                    created (defaults to ``task_data`` in the working
                    directory).
        """
        self.base_dir = base_dir
        self.tasks = {}

    def _task_dir(self, task_id):
        """Return the directory that holds the on-disk copy of *task_id*."""
        return os.path.join(self.base_dir, str(task_id))

    @staticmethod
    def _encode_extra(value):
        """JSON fallback encoder: UUIDs are written as their string form."""
        if isinstance(value, uuid.UUID):
            return str(value)
        raise TypeError("%r is not JSON serializable" % (value,))

    def get(self, task_id):
        """Return the in-memory data for *task_id*.

        Raises KeyError when the task is unknown.
        """
        return self.tasks[task_id]

    def put(self, task_id, task_data):
        """Store *task_data* under *task_id*, on disk first and then in RAM.

        Raises OSError/IOError when the task directory or file cannot be
        written, and TypeError when *task_data* is not JSON-serializable; in
        both cases the in-memory copy is left untouched.
        """
        # Persist on disk
        task_dir = self._task_dir(task_id)
        try:
            os.makedirs(task_dir)
        except OSError:
            # Re-storing a known task finds its directory already present;
            # any other failure (permissions, a file in the way) is real.
            if not os.path.isdir(task_dir):
                raise

        # Serialise before opening so a failure does not truncate the file.
        serialized = json.dumps(task_data, default=self._encode_extra)
        with open(os.path.join(task_dir, self.TASK_FILENAME), "w") as task_file:
            task_file.write(serialized)

        # Store in RAM
        self.tasks[task_id] = task_data

    def delete(self, task_id):
        """Remove *task_id* from disk and from memory.

        A missing directory is ignored; an unknown *task_id* raises KeyError
        after the disk cleanup has been attempted.
        """
        shutil.rmtree(self._task_dir(task_id), ignore_errors=True)
        del self.tasks[task_id]
|
|
|
|
class TaskDistributor(object):
    """Hand out task ids from named FIFO pools and track the ones in flight.

    ``pools`` maps a pool name to a deque of waiting task ids;
    ``processing`` maps a pool name to the list of task ids that were handed
    out by ``get`` and not yet reported through ``done`` or ``fail``.
    """

    def __init__(self):
        self.pools = defaultdict(deque)
        self.processing = defaultdict(list)

    def put(self, pool, task_id):
        """Append *task_id* to the end of *pool*'s waiting queue."""
        self.pools[pool].append(task_id)

    def get(self, pool):
        """Take the oldest waiting task id from *pool* and mark it in flight.

        Raises IndexError when the pool has nothing waiting.
        """
        task_id = self.pools[pool].popleft()
        self.processing[pool].append(task_id)
        return task_id

    def get_pools(self, task_id):
        """Return the names of the pools in which *task_id* is still waiting.

        Only the waiting queues are searched; tasks currently in flight are
        not reported.
        """
        return [
            name for name, waiting in self.pools.items() if task_id in waiting
        ]

    def done(self, pool, task_id):
        """Mark *task_id* as finished in *pool*.

        Raises ValueError when the task is not in flight for that pool.
        """
        self.processing[pool].remove(task_id)

    def fail(self, pool, task_id):
        """Return a failed in-flight task to the back of *pool*'s queue.

        Raises ValueError when the task is not in flight for that pool.
        """
        # Re-add
        self.processing[pool].remove(task_id)
        self.put(pool, task_id)
|
|
|
|
class IsbnProcessor(object):
    """Normalise and checksum-validate ISBN-10, ISBN-13 and 9-digit SBN strings."""

    _DIGITS = "0123456789"

    def clean(self, isbn):
        """Return *isbn* upper-cased with dashes and spaces removed.

        A 9-character result is treated as an old SBN and gets a leading
        ``0`` so it can be handled as an ISBN-10.
        """
        isbn = isbn.upper().replace("-", "").replace(" ", "")

        if len(isbn) == 9:  # 9 digit SBN
            isbn = "0" + isbn

        return isbn

    def validate(self, isbn):
        """Return True when *isbn* has a correct ISBN-10 or ISBN-13 check digit.

        The input is passed through ``clean`` first. Anything that is not 10
        or 13 characters long afterwards, or that contains characters other
        than digits (plus a final ``X`` for ISBN-10), is reported as invalid
        rather than raising.
        """
        isbn = self.clean(isbn)

        if len(isbn) == 10:
            return self._isbn10_check_char(isbn[:9]) == isbn[9]
        if len(isbn) == 13:
            return self._isbn13_check_char(isbn[:12]) == isbn[12]
        return False

    def _to_digits(self, text):
        """Return the digits of *text* as ints, or None if any char is not 0-9."""
        if not all(char in self._DIGITS for char in text):
            return None
        return [int(char) for char in text]

    def _isbn10_check_char(self, payload):
        """Return the expected ISBN-10 check character for 9 payload digits.

        Returns None when the payload is not purely numeric.
        """
        digits = self._to_digits(payload)
        if digits is None:
            return None

        # Weights run 10, 9, ..., 2 over the nine payload digits.
        total = sum(digit * (10 - pos) for pos, digit in enumerate(digits))
        # The outer modulo folds the "11" case (total divisible by 11) to 0.
        check = (11 - total % 11) % 11
        return "X" if check == 10 else str(check)

    def _isbn13_check_char(self, payload):
        """Return the expected ISBN-13 check character for 12 payload digits.

        Returns None when the payload is not purely numeric.
        """
        digits = self._to_digits(payload)
        if digits is None:
            return None

        # Weights alternate 1, 3, 1, 3, ... starting with 1 on the first digit.
        total = sum(
            digit * (3 if pos % 2 else 1) for pos, digit in enumerate(digits)
        )
        # The outer modulo folds the "10" case (total divisible by 10) to 0.
        return str((10 - total % 10) % 10)
|
|
|
|
class BookTaskClassifier(object):
    """Decide which processing pools a book task is still eligible for."""

    def __init__(self):
        self.isbn = IsbnProcessor()

    def get_pools(self, task_data):
        """Return the pool names *task_data* should be queued in.

        A pool is eligible when the task carries a usable identifier for it
        and the pool is not already listed in ``task_data["pools_done"]``:

        * ``"isbn"``   -- at least one entry of ``book_data.ids.isbn`` validates
        * ``"amazon"`` / ``"google"`` -- the matching id is a non-blank string
        * ``"title"``  -- fallback, used only when nothing else is eligible

        Each pool appears at most once. Missing ``book_data`` / ``ids`` /
        identifier keys simply make that pool ineligible.

        Raises KeyError when *task_data* has no ``"pools_done"`` entry.
        """
        pools_done = task_data["pools_done"]
        eligible_pools = []

        if "isbn" not in pools_done and self._has_valid_isbn(task_data):
            eligible_pools.append("isbn")

        for identifier in ("amazon", "google"):
            # Keep the try narrow: only a missing key means "no identifier".
            try:
                value = task_data["book_data"]["ids"][identifier]
            except KeyError:
                continue
            if value.strip() != "" and identifier not in pools_done:
                eligible_pools.append(identifier)

        if not eligible_pools and "title" not in pools_done:
            eligible_pools.append("title")

        return eligible_pools

    def _has_valid_isbn(self, task_data):
        """Return True when any listed ISBN passes checksum validation."""
        try:
            candidates = task_data["book_data"]["ids"]["isbn"]
        except KeyError:
            return False
        return any(self.isbn.validate(item.strip()) for item in candidates)
|
|
|
|
class BookTaskManager(object):
    """Front door for book tasks: create, classify, queue and complete them.

    Combines a ``TaskManager`` (storage), a ``TaskDistributor`` (per-pool
    queues) and a ``BookTaskClassifier`` (which pools a task belongs in).
    """

    def __init__(self):
        self.manager = TaskManager()
        self.distributor = TaskDistributor()
        self.classifier = BookTaskClassifier()

    def new(self, book_data):
        """Create, store and queue a task for *book_data*.

        Returns the new task id. The id is a UUID4 rendered as a string so
        the task record stays JSON-serializable.
        """
        task_id = str(uuid.uuid4())
        task_data = {
            "id": task_id,
            "book_data": book_data,
            "pools_done": [],
            "logs": [],
            "flags": [],
        }

        self.manager.put(task_id, task_data)
        self.enqueue(task_id)
        return task_id

    def enqueue(self, task_id):
        """Queue *task_id* in every pool the classifier reports for it.

        Raises KeyError when the task is unknown to the task manager.
        """
        task_data = self.manager.get(task_id)

        for pool in self.classifier.get_pools(task_data):
            self.distributor.put(pool, task_id)

        # When the classifier reports nothing, there are no more pools to
        # put this into... this is the best we have! The task is left as is.

    def get(self, pool):
        """Return the next task id waiting in *pool* and mark it in flight."""
        return self.distributor.get(pool)

    def done(self, pool, task_data):
        """Report a task finished in *pool* and store its updated data.

        *task_data* must carry the task id under ``"id"``.
        """
        task_id = task_data["id"]
        self.distributor.done(pool, task_id)
        # NOTE(review): this overwrites the stored record wholesale, so
        # results reported for the same task from different pools can
        # clobber each other -- the original flagged this as "conflicts..".
        self.manager.put(task_id, task_data)
|