mirror of
https://github.com/gryf/pygtktalog.git
synced 2025-12-17 11:30:19 +01:00
443 lines
14 KiB
Python
443 lines
14 KiB
Python
"""
|
|
Project: pyGTKtalog
|
|
Description: Filesystem scan and file automation layer
|
|
Type: core
|
|
Author: Roman 'gryf' Dobosz, gryf73@gmail.com
|
|
Created: 2011-03-27
|
|
"""
|
|
import mimetypes
import os
import re
from datetime import datetime

import pycatalog.misc
from pycatalog.dbcommon import Session
from pycatalog.dbobjects import File, Config, TYPE
from pycatalog.logger import get_logger
from pycatalog.video import Video
|
|
|
|
|
|
# Module-level logger named after this module, so log output can be filtered.
LOG = get_logger(__name__)
|
|
# Matches release-style filenames like "[grp] Title (2004) foo [ABCD1234].ext"
# and captures the common name prefix shared by sibling files of one release.
RE_FN_START = re.compile(r"""
    (?P<fname_start>
        (\[[^\]]*\]\s)?         # optional leading "[group]" tag
        ([^(]*)\s               # title part, up to the first parenthesis
        ((\(\d{4}\))\s)?        # optional "(year)"
    )
    .*
    (\[[A-Fa-f0-9]{8}\])        # 8-hex-digit CRC tag in square brackets
    \..*
""", re.VERBOSE)
|
|
|
|
|
|
class NoAccessError(Exception):
    """Raised when a path cannot be read or traversed."""
|
|
|
|
|
|
class Scan(object):
    """
    Retrieve and identify all files recursively on given path.
    """

    def __init__(self, path):
        """
        Initialize the scanner.
        @Arguments:
        @path - string with path to be added to topmost node (root)
        """
        # Cooperative abort flag; _recursive() checks it on every call, so
        # setting it to True (e.g. from a GUI thread) stops the walk.
        self.abort = False
        self.path = path.rstrip(os.path.sep)
        self._files = []
        self._existing_files = []  # for re-use purpose in adding
        self._existing_branch = []  # for branch storage, mainly for updating
        self._session = Session()
        self.files_count = self._get_files_count()
        self.current_count = 0

    def add_files(self, engine=None):
        """
        Scan self.path and store discovered File objects in the database.

        Returns the list of created File objects (the first element is the
        root of the scanned branch), or None when the scan was aborted.

        Raises OSError when self.path does not exist and NoAccessError when
        it is not a readable/traversable directory.
        """
        self._files = []
        self._existing_branch = []
        LOG.debug("given path: %s", self.path)

        # See, if file exists. If not it would raise OSError exception.
        os.stat(self.path)

        if not os.access(self.path, os.R_OK | os.X_OK) \
                or not os.path.isdir(self.path):
            raise NoAccessError("Access to %s is forbidden" % self.path)

        directory = os.path.basename(self.path)
        path = os.path.dirname(self.path)

        if not self._recursive(None, directory, path, 0):
            return None

        # add only first item from _files, because it is a root of the
        # other, so the others will be automatically added as well.
        self._session.add(self._files[0])
        self._session.commit()
        return self._files

    def get_all_children(self, node_id, engine):
        """
        Get ids of all descendants of a node by pure SQL, then fetch the
        matching File objects through the ORM.

        Starting from sqlite 3.8.3 it is possible to do this operation as a
        one query using WITH statement. For now on it has to be done in
        application.
        """
        query = "select id from files where parent_id=? and type=1"
        query2 = "select id from files where parent_id in (%s)"

        row = ((node_id,),)
        all_ids = []

        def req(obj):
            """Recursive function for gathering all child ids for node"""
            for line in obj:
                all_ids.append(line[0])
                res = engine.execute(query, (line[0],)).fetchall()
                if res:
                    req(res)

        req(row)

        # Expand the collected directory ids into ids of everything they
        # directly contain (files and subdirectories alike).
        sql = query2 % ",".join("?" * len(all_ids))
        all_ids = [row_[0] for row_ in engine
                   .execute(sql, tuple(all_ids))
                   .fetchall()]

        all_obj = []
        # number of objects to retrieve at once. SQLite variable limit is
        # 999 - stay a little bit below it.
        num = 900
        steps = len(all_ids) // num + 1
        for step in range(steps):
            all_obj.extend(self._session
                           .query(File)
                           .filter(File.id
                                   .in_(all_ids[step * num:step * num + num]))
                           .all())
        return all_obj

    def update_files(self, node_id, engine=None):
        """
        Update DB contents of provided node.
        @Arguments:
        @node_id - id of the File object (directory) to re-scan
        @engine - optional raw DB engine; when given, children are fetched
                  with plain SQL instead of the (slow) ORM walk

        Returns the refreshed list of File objects, or None when the scan
        was aborted. Returns None early when node_id is unknown.
        """
        self.current_count = 0
        old_node = self._session.query(File).get(node_id)
        if old_node is None:
            LOG.warning("No such object in db: %s", node_id)
            return
        parent = old_node.parent

        self._files = []

        if engine:
            LOG.debug("Getting all File objects via SQL")
            self._existing_branch = self.get_all_children(node_id, engine)
        else:
            LOG.debug("Getting all File objects via ORM (yeah, it SLOW)")
            self._existing_branch = old_node.get_all_children()

        self._existing_branch.insert(0, old_node)

        # Break the chain of parent-children relations, so that whatever is
        # not reclaimed by the re-scan stays parentless and gets deleted at
        # the end as an orphan.
        LOG.debug("Make them orphans")
        for fobj in self._existing_branch:
            fobj.parent = None

        update_path = os.path.join(old_node.filepath, old_node.filename)

        # refresh objects
        LOG.debug("Refreshing objects")
        self._get_all_files()

        LOG.debug("path for update: %s", update_path)

        # See, if file exists. If not it would raise OSError exception.
        os.stat(update_path)

        if not os.access(update_path, os.R_OK | os.X_OK) \
                or not os.path.isdir(update_path):
            LOG.error("Access to %s is forbidden", update_path)
            raise NoAccessError("Access to %s is forbidden" % update_path)

        directory = os.path.basename(update_path)
        path = os.path.dirname(update_path)

        if not self._recursive(parent, directory, path, 0):
            return None

        # update branch: everything still parentless was not found on disk
        # again, so drop it from the database.
        LOG.debug("Deleting objects without parent: %s",
                  str(self._session.query(File)
                      .filter(File.parent.is_(None)).all()))
        self._session.query(File).filter(File.parent.is_(None)).delete()

        self._session.commit()
        return self._files

    def _gather_information(self, fobj):
        """
        Try to guess type and gather information about File object if
        possible.
        """
        mimedict = {'audio': self._audio,
                    'video': self._video,
                    'image': self._image}
        extdict = {'.mkv': 'video',  # TODO: move this to config/plugin(?)
                   '.rmvb': 'video',
                   '.ogm': 'video',
                   '.ogv': 'video'}

        fp = os.path.join(fobj.filepath, fobj.filename)

        mimeinfo = mimetypes.guess_type(fp)
        if mimeinfo[0]:
            # Keep only the major type, i.e. "video/x-msvideo" -> "video".
            mimeinfo = mimeinfo[0].split("/")[0]

        ext = os.path.splitext(fp)[1]

        if mimeinfo and mimeinfo in mimedict:
            mimedict[mimeinfo](fobj, fp)
        elif ext and ext in extdict:
            # Fall back to the extension map for types mimetypes misses.
            mimedict[extdict[ext]](fobj, fp)
        else:
            LOG.debug("Filetype not supported %s %s", str(mimeinfo), fp)

    def _audio(self, fobj, filepath):
        """Placeholder for audio metadata extraction - not implemented."""
        return

    def _image(self, fobj, filepath):
        """Placeholder for image metadata extraction - not implemented."""
        return

    def _video(self, fobj, filepath):
        """
        Gather movie metadata and store it as the object's description.
        """
        vid = Video(filepath)
        fobj.description = vid.get_formatted_tags()

    def _get_all_files(self):
        """Gather all File objects from the database."""
        self._existing_files = self._session.query(File).all()

    def _mk_file(self, fname, path, parent, ftype=TYPE['file']):
        """
        Create and return File object.

        Reuses a matching object from the currently-updated branch (see
        _get_old_file) when possible, otherwise creates a fresh File.
        """
        fullpath = os.path.join(path, fname)

        if ftype == TYPE['link']:
            fname = fname + " -> " + os.readlink(fullpath)

        fob = {'filename': fname,
               'path': path,
               'ftype': ftype}
        try:
            # One stat call covers both mtime and size.
            stat_result = os.stat(fullpath)
            fob['date'] = datetime.fromtimestamp(stat_result.st_mtime)
            fob['size'] = stat_result.st_size
        except OSError:
            # in case of dead softlink, we will have no time and size
            fob['date'] = None
            fob['size'] = 0

        fobj = self._get_old_file(fob, ftype)

        if fobj:
            LOG.debug("found existing file in db: %s", str(fobj))
            # TODO: update whole tree sizes (for directories/discs)
            fobj.size = fob['size']
            fobj.filepath = fob['path']
            fobj.type = fob['ftype']
        else:
            fobj = File(**fob)
            # SLOW. Don't do this. Checksums has no value eventually
            # fobj.mk_checksum()

        if parent is None:
            # Top-level node; id 1 is the catalog root record.
            fobj.parent_id = 1
        else:
            fobj.parent = parent

        self._files.append(fobj)

        return fobj

    def _non_recursive(self, parent, fname, path, size):
        """
        Do the walk through the file system. Non recursively, since it's
        slow as hell.
        @Arguments:
        @parent - directory File object which is parent for the current
                  scope
        @fname - string that hold filename
        @path - full path for further scanning
        @size - size of the object

        NOTE: unfinished - currently only accumulates sizes and counts.
        """
        fullpath = os.path.join(path, fname)
        parent = self._mk_file(fname, path, parent, TYPE['dir'])
        parent.size = 0
        parent.type = TYPE['dir']

        for root, _, files in os.walk(fullpath):
            for file_ in files:
                self.current_count += 1
                stat = os.lstat(os.path.join(root, file_))
                parent.size += stat.st_size

        # TODO: finish that up

    def _recursive(self, parent, fname, path, size):
        """
        Do the walk through the file system
        @Arguments:
        @parent - directory File object which is parent for the current
                  scope
        @fname - string that hold filename
        @path - full path for further scanning
        @size - size of the object

        Returns False when the scan was aborted, True otherwise.
        """
        if self.abort:
            return False

        fullpath = os.path.join(path, fname)

        parent = self._mk_file(fname, path, parent, TYPE['dir'])

        parent.size = _get_dirsize(fullpath)
        parent.type = TYPE['dir']

        LOG.info("Scanning `%s' [%s/%s]", fullpath, self.current_count,
                 self.files_count)

        # Only the top level of fullpath; subdirectories are handled by the
        # recursive calls below.
        root, dirs, files = next(os.walk(fullpath))
        for fname in files:
            fpath = os.path.join(root, fname)
            extension = os.path.splitext(fname)[1]
            self.current_count += 1
            LOG.debug("Processing %s [%s/%s]", fname, self.current_count,
                      self.files_count)

            result = RE_FN_START.match(fname)
            skip_image = False

            # Skip cover/screenshot images that belong to a group of
            # similarly-named release files (name + CRC tag scheme).
            if result and extension in ('.jpg', '.gif', '.png'):
                startfrom = result.groupdict()['fname_start']
                matching_files = []
                for fn_ in os.listdir(root):
                    if fn_.startswith(startfrom):
                        matching_files.append(fn_)

                if len(matching_files) > 1:
                    LOG.debug('found image "%s" in group: %s, skipping',
                              fname, str(matching_files))
                    skip_image = True
            if skip_image:
                continue

            if os.path.islink(fpath):
                fob = self._mk_file(fname, root, parent, TYPE['link'])
            else:
                fob = self._mk_file(fname, root, parent)
                existing_obj = self._object_exists(fob)

                if existing_obj:
                    # Same file already catalogued elsewhere - reparent it.
                    existing_obj.parent = fob.parent
                    fob = existing_obj
                else:
                    LOG.debug("gather information for %s",
                              os.path.join(root, fname))
                    self._gather_information(fob)
                size += fob.size
                if fob not in self._existing_files:
                    self._existing_files.append(fob)

        for dirname in dirs:
            dirpath = os.path.join(root, dirname)

            if not os.access(dirpath, os.R_OK | os.X_OK):
                LOG.info("Cannot access directory %s", dirpath)
                continue

            if os.path.islink(dirpath):
                fob = self._mk_file(dirname, root, parent, TYPE['link'])
            else:
                LOG.debug("going into %s", os.path.join(root, dirname))
                self._recursive(parent, dirname, fullpath, size)

        LOG.debug("size of items: %s", parent.size)
        return True

    def _get_old_file(self, fdict, ftype):
        """
        Search for object with provided data in dictionary in stored branch
        (which is updating). Return such object on success, remove it from
        list. Returns False when nothing matches.
        """
        for index, obj in enumerate(self._existing_branch):
            if ftype == TYPE['link'] and fdict['filename'] == obj.filename:
                # Links are matched by name only.
                return self._existing_branch.pop(index)
            elif fdict['filename'] == obj.filename and \
                    fdict['date'] == obj.date and \
                    ftype == TYPE['file'] and \
                    fdict['size'] in (obj.size, 0):
                # Unchanged regular file - just refresh the size.
                obj = self._existing_branch.pop(index)
                obj.size = fdict['size']
                return obj
            elif fdict['filename'] == obj.filename:
                # Same name, different metadata - the file has changed, so
                # refresh both attributes. (Fixes a typo that stored the
                # date in the size attribute.)
                obj = self._existing_branch.pop(index)
                obj.size = fdict['size']
                obj.date = fdict['date']
                return obj
        return False

    def _object_exists(self, fobj):
        """
        Perform check if current File object already exists in collection.
        If so, return first matching one, None otherwise.
        """
        for efobj in self._existing_files:
            if efobj.size == fobj.size \
                    and efobj.type == fobj.type \
                    and efobj.date == fobj.date \
                    and efobj.filename == fobj.filename:
                return efobj
        return None

    def _get_files_count(self):
        """Return total number of files found under self.path."""
        count = 0
        for _, _, files in os.walk(str(self.path)):
            count += len(files)
        LOG.debug("count of files: %s", count)
        return count
|
|
|
|
|
|
def _get_dirsize(path):
    """
    Return the total size in bytes of all files under *path*, including
    files in subdirectories. Files that cannot be stat'ed (e.g. dead
    symlinks) are logged and counted as zero.
    """
    total = 0
    for root, _, fnames in os.walk(path):
        for fname in fnames:
            fpath = os.path.join(root, fname)
            try:
                total += os.lstat(fpath).st_size
            except OSError:
                LOG.warning("Cannot access file %s", fpath)
    LOG.debug("_get_dirsize, %s: %d", path, total)
    return total
|