"""
    Project: pyGTKtalog
    Description: Filesystem scan and file automation layer
    Type: core
    Author: Roman 'gryf' Dobosz, gryf73@gmail.com
    Created: 2011-03-27
"""
import os
import re
from datetime import datetime
import mimetypes

import pycatalog.misc
from pycatalog.dbobjects import File, Config, TYPE
from pycatalog.dbcommon import Session
from pycatalog.logger import get_logger
from pycatalog.video import Video


LOG = get_logger(__name__)

# Matches file names shaped like "[tag] Title (2011) ... [DEADBEEF].ext".
# The named group ``fname_start`` captures the leading
# "[tag] Title (year) " part; _recursive() uses it to detect images which
# accompany another file sharing the same name prefix.
RE_FN_START = re.compile(r'(?P<fname_start>'
                         r'(\[[^\]]*\]\s)?'
                         r'([^(]*)\s'
                         r'((\(\d{4}\))\s)?).*'
                         r'(\[[A-Fa-f0-9]{8}\])\..*')


class NoAccessError(Exception):
    """No access exception"""


class Scan(object):
    """
    Retrieve and identify all files recursively on given path
    """
    def __init__(self, path):
        """
        Initialize
        @Arguments:
            @path - string with path to be added to topmost node (root)
        """
        # Can be set to True from outside to stop a running scan.
        self.abort = False
        self.path = path.rstrip(os.path.sep)
        self._files = []
        self._existing_files = []  # for re-use purpose in adding
        self._existing_branch = []  # for branch storage, mainly for updating
        self._session = Session()
        # Note: counting files walks the whole tree once, up front.
        self.files_count = self._get_files_count()
        self.current_count = 0

    def add_files(self, engine=None):
        """
        Scan self.path and store the resulting tree in the database.
        @Arguments:
            @engine - not used by this method
        Returns list of created/reused File objects (root directory first),
        or None if the scan has been aborted.
        Raises OSError if path doesn't exist and NoAccessError if it is not a
        readable directory.
        """
        self._files = []
        self._existing_branch = []
        LOG.debug("given path: %s", self.path)

        # See, if file exists. If not it would raise OSError exception
        os.stat(self.path)

        if not os.access(self.path, os.R_OK | os.X_OK) \
                or not os.path.isdir(self.path):
            raise NoAccessError("Access to %s is forbidden" % self.path)

        directory = os.path.basename(self.path)
        path = os.path.dirname(self.path)

        if not self._recursive(None, directory, path, 0):
            return None

        # add only first item from _files, because it is a root of the other,
        # so other will be automatically added as well.
        self._session.add(self._files[0])
        self._session.commit()
        return self._files

    def get_all_children(self, node_id, engine):
        """
        Get children by pure SQL

        Starting from sqlite 3.8.3 it is possible to do this operation as a
        one query using WITH statement. For now on it has to be done in
        application.
        @Arguments:
            @node_id - id of the node, which descendants should be gathered
            @engine - object providing execute(sql, params), which result
                      supports fetchall()
        Returns list of File objects placed anywhere below given node (the
        node itself is not included).
        """
        query = "select id from files where parent_id=? and type=1"
        query2 = "select id from files where parent_id in (%s)"

        # Number of bound parameters used at once. Limit is 999. Let's do a
        # little bit below.
        num = 900

        # Collect ids of the node and of every row selected by `query` below
        # it. Done iteratively (explicit stack), so that deep trees cannot
        # hit the interpreter recursion limit. Children are pushed reversed
        # to keep the depth-first, pre-order sequence.
        parent_ids = []
        stack = [node_id]
        while stack:
            current = stack.pop()
            parent_ids.append(current)
            rows = engine.execute(query, (current,)).fetchall()
            stack.extend(row_[0] for row_ in reversed(rows))

        # Ids of everything which has one of gathered ids as a parent.
        # Chunked, since there is one bound parameter per parent id.
        all_ids = []
        for start in range(0, len(parent_ids), num):
            chunk = parent_ids[start:start + num]
            sql = query2 % ",".join("?" * len(chunk))
            all_ids.extend(row_[0] for row_
                           in engine.execute(sql, tuple(chunk)).fetchall())

        all_obj = []
        for start in range(0, len(all_ids), num):
            chunk = all_ids[start:start + num]
            all_obj.extend(self._session
                           .query(File)
                           .filter(File.id.in_(chunk))
                           .all())
        return all_obj

    def update_files(self, node_id, engine=None):
        """
        Update DB contents of provided node.
        @Arguments:
            @node_id - id of the File object to be rescanned
            @engine - optional object used for gathering children via plain
                      SQL (see get_all_children); without it the ORM is used
        Returns list of File objects of the refreshed branch, or None if
        there is no such node or the scan has been aborted.
        Raises OSError if path of the node doesn't exist and NoAccessError if
        it is not a readable directory.
        """
        self.current_count = 0
        old_node = self._session.query(File).get(node_id)
        if old_node is None:
            LOG.warning("No such object in db: %s", node_id)
            return
        parent = old_node.parent

        self._files = []

        if engine:
            LOG.debug("Getting all File objects via SQL")
            self._existing_branch = self.get_all_children(node_id, engine)
        else:
            LOG.debug("Getting all File objects via ORM (yeah, it SLOW)")
            self._existing_branch = old_node.get_all_children()

        self._existing_branch.insert(0, old_node)

        # Break the chain of parent-children relations. Objects found again
        # during the scan get a new parent; those which stay orphaned are
        # removed at the end.
        LOG.debug("Make them orphans")
        for fobj in self._existing_branch:
            fobj.parent = None

        update_path = os.path.join(old_node.filepath, old_node.filename)

        # refresh objects
        LOG.debug("Refreshing objects")
        self._get_all_files()

        LOG.debug("path for update: %s", update_path)

        # See, if file exists. If not it would raise OSError exception
        os.stat(update_path)

        if not os.access(update_path, os.R_OK | os.X_OK) \
                or not os.path.isdir(update_path):
            LOG.error("Access to %s is forbidden", update_path)
            raise NoAccessError("Access to %s is forbidden" % update_path)

        directory = os.path.basename(update_path)
        path = os.path.dirname(update_path)

        if not self._recursive(parent, directory, path, 0):
            return None

        # update branch
        # self._session.merge(self._files[0])
        LOG.debug("Deleting objects whitout parent: %s",
                  str(self._session.query(File)
                      .filter(File.parent.is_(None)).all()))
        self._session.query(File).filter(File.parent.is_(None)).delete()

        self._session.commit()
        return self._files

    def _gather_information(self, fobj):
        """
        Try to guess type and gather information about File object if possible
        @Arguments:
            @fobj - File object to be examined (and possibly updated)
        """
        mimedict = {'audio': self._audio,
                    'video': self._video,
                    'image': self._image}
        extdict = {'.mkv': 'video',  # TODO: move this to config/plugin(?)
                   '.rmvb': 'video',
                   '.ogm': 'video',
                   '.ogv': 'video'}

        fp = os.path.join(fobj.filepath, fobj.filename)

        # Only the major part of the mime type ("video" out of "video/mp4")
        # is significant here; None if the type cannot be guessed.
        guessed = mimetypes.guess_type(fp)[0]
        major = guessed.split("/")[0] if guessed else None

        # Fallback for types unknown to mimetypes. Compared in lowercase, so
        # that "MOVIE.MKV" is treated the same way as "movie.mkv".
        ext = os.path.splitext(fp)[1].lower()

        if major in mimedict:
            mimedict[major](fobj, fp)
        elif ext in extdict:
            mimedict[extdict[ext]](fobj, fp)
        else:
            LOG.debug("Filetype not supported %s %s", str(major), fp)

    def _audio(self, fobj, filepath):
        """Placeholder for audio files handling. Does nothing."""
        # LOG.warning('audio')
        return

    def _image(self, fobj, filepath):
        """Placeholder for image files handling. Does nothing."""
        # LOG.warning('image')
        return

    def _video(self, fobj, filepath):
        """
        Store formatted tags of the video file as description of the File
        object.
        @Arguments:
            @fobj - File object to be updated
            @filepath - full path to the video file
        """
        vid = Video(filepath)

        fobj.description = vid.get_formatted_tags()

    def _get_all_files(self):
        """Gather all File objects"""
        self._existing_files = self._session.query(File).all()

    def _mk_file(self, fname, path, parent, ftype=TYPE['file']):
        """
        Create and return File object. If matching object is found in
        currently updated branch, it is reused instead of creating a new one.
        @Arguments:
            @fname - string with the name of the file
            @path - directory in which the file resides
            @parent - parent File object or None for the topmost one
            @ftype - one of the TYPE values
        """
        fullpath = os.path.join(path, fname)

        if ftype == TYPE['link']:
            fname = fname + " -> " + os.readlink(fullpath)

        fob = {'filename': fname,
               'path': path,
               'ftype': ftype}
        try:
            # single stat call, so date and size describe the same state
            stat = os.stat(fullpath)
            fob['date'] = datetime.fromtimestamp(stat.st_mtime)
            fob['size'] = stat.st_size
        except OSError:
            # in case of dead softlink, we will have no time and size
            fob['date'] = None
            fob['size'] = 0

        fobj = self._get_old_file(fob, ftype)

        if fobj:
            LOG.debug("found existing file in db: %s", str(fobj))
            # TODO: update whole tree sizes (for directories/discs)
            fobj.size = fob['size']
            fobj.filepath = fob['path']
            fobj.type = fob['ftype']
        else:
            fobj = File(**fob)
            # SLOW. Don't do this. Checksums has no value eventually
            # fobj.mk_checksum()

        if parent is None:
            fobj.parent_id = 1
        else:
            fobj.parent = parent

        self._files.append(fobj)

        return fobj

    def _non_recursive(self, parent, fname, path, size):
        """
        Do the walk through the file system. Non recursively, since it's slow
        as hell.
        NOTE: unfinished - for now it only creates the directory object and
        sums up sizes of the files below it.
        @Arguments:
            @parent - directory File object which is parent for the current
                      scope
            @fname - string that hold filename
            @path - full path for further scanning
            @size - size of the object
        """
        fullpath = os.path.join(path, fname)
        parent = self._mk_file(fname, path, parent, TYPE['dir'])
        parent.size = 0
        parent.type = TYPE['dir']

        for root, _dirs, files in os.walk(fullpath):
            for file_ in files:
                self.current_count += 1
                stat = os.lstat(os.path.join(root, file_))
                parent.size += stat.st_size

        # TODO: finish that up

    def _recursive(self, parent, fname, path, size):
        """
        Do the walk through the file system
        @Arguments:
            @parent - directory File object which is parent for the current
                      scope
            @fname - string that hold filename
            @path - full path for further scanning
            @size - size of the object
        Returns False if scan has been aborted, True otherwise.
        """
        if self.abort:
            return False

        fullpath = os.path.join(path, fname)

        parent = self._mk_file(fname, path, parent, TYPE['dir'])
        parent.size = _get_dirsize(fullpath)
        parent.type = TYPE['dir']

        LOG.info("Scanning `%s' [%s/%s]", fullpath, self.current_count,
                 self.files_count)

        # Only the first level is taken from os.walk - subdirectories are
        # handled by recursive calls below. os.walk yields nothing for a
        # directory which cannot be listed, hence the default value.
        root, dirs, files = next(os.walk(fullpath), (fullpath, [], []))

        # Directory listing for image grouping; read lazily, at most once.
        entries = None

        for filename in files:
            fpath = os.path.join(root, filename)
            extension = os.path.splitext(filename)[1]
            self.current_count += 1
            LOG.debug("Processing %s [%s/%s]", filename, self.current_count,
                      self.files_count)

            # Skip images which share the name prefix with some other entry
            # of the directory.
            result = RE_FN_START.match(filename)
            if result and extension in ('.jpg', '.gif', '.png'):
                startfrom = result.group('fname_start')
                if entries is None:
                    entries = os.listdir(root)
                matching_files = [fn_ for fn_ in entries
                                  if fn_.startswith(startfrom)]
                if len(matching_files) > 1:
                    LOG.debug('found image "%s" in group: %s, skipping',
                              filename, str(matching_files))
                    continue

            if os.path.islink(fpath):
                fob = self._mk_file(filename, root, parent, TYPE['link'])
            else:
                fob = self._mk_file(filename, root, parent)

                existing_obj = self._object_exists(fob)

                if existing_obj:
                    existing_obj.parent = fob.parent
                    fob = existing_obj
                else:
                    LOG.debug("gather information for %s", fpath)
                    self._gather_information(fob)
                size += fob.size
                if fob not in self._existing_files:
                    self._existing_files.append(fob)

        for dirname in dirs:
            dirpath = os.path.join(root, dirname)

            if not os.access(dirpath, os.R_OK | os.X_OK):
                LOG.info("Cannot access directory %s", dirpath)
                continue

            if os.path.islink(dirpath):
                self._mk_file(dirname, root, parent, TYPE['link'])
            else:
                LOG.debug("going into %s", dirpath)
                # Propagate abort up to the caller, so that aborted scan is
                # not reported (and committed) as a successful one.
                if not self._recursive(parent, dirname, fullpath, size):
                    return False

        LOG.debug("size of items: %s", parent.size)
        return True

    def _get_old_file(self, fdict, ftype):
        """
        Search for object with provided data in dictionary in stored branch
        (which is updating). Return such object on success, remove it from
        list. Return False if there is no such object.
        @Arguments:
            @fdict - dictionary with 'filename', 'date' and 'size' keys
            @ftype - one of the TYPE values
        """
        # Objects are matched by file name only (path is not compared), and
        # the first match wins.
        for index, obj in enumerate(self._existing_branch):
            if fdict['filename'] != obj.filename:
                continue

            obj = self._existing_branch.pop(index)
            if ftype != TYPE['link']:
                # refresh the data, which may have changed since last scan;
                # links are returned untouched
                obj.size = fdict['size']
                obj.date = fdict['date']
            return obj
        return False

    def _object_exists(self, fobj):
        """
        Perform check if current File object already exists in collection. If
        so, return first matching one, None otherwise.
        """
        return next((efobj for efobj in self._existing_files
                     if efobj.size == fobj.size
                     and efobj.type == fobj.type
                     and efobj.date == fobj.date
                     and efobj.filename == fobj.filename), None)

    def _get_files_count(self):
        """Return number of files under self.path (subdirectories included)"""
        count = sum(len(files) for _, _, files in os.walk(str(self.path)))
        LOG.debug("count of files: %s", count)
        return count


def _get_dirsize(path):
    """
    Returns sum of all files under specified path (also in subdirs).
    Files which cannot be accessed are logged and left out.
    """
    size = 0

    for root, _, files in os.walk(path):
        for fname in files:
            fpath = os.path.join(root, fname)
            try:
                # lstat, so symlinks count with their own size, not with
                # the size of the target
                size += os.lstat(fpath).st_size
            except OSError:
                LOG.warning("Cannot access file %s", fpath)
    LOG.debug("_get_dirsize, %s: %d", path, size)
    return size