######################################
# Parse bookmarks exported from ffox in JSON format and write them
# in the html format used by emacs-w3m, optionally merging
# with an existing emacs-w3m bookmark file. Note that nested
# folders in the ffox bookmarks are flattened, as emacs-w3m
# bookmark categories are all on one level.
#
# Will cope with the current (Nov 2009) bug in ffox which exports
# badly formed JSON, but if other parse errors occur run the JSON
# through a validator.
#
# Usage:
# $ python ffox2w3m.py FFOX_JSON_FILE [EMACS_W3M_FILE] > NEW_BKMKS.HTML
#
# v0.1
#
# Tested with ffox 3.0
#
# bugs etc to jim@sdf-eu.org
#
######################################
from __future__ import print_function

import json
import sys

try:
    import sgmllib  # Python 2 only; the module does not exist in Python 3
    _ParserBase = sgmllib.SGMLParser
except ImportError:
    from html.parser import HTMLParser

    class _ParserBase(HTMLParser):
        """Fallback base class used when sgmllib is unavailable.

        Provides the two sgmllib features HtmlBkmks relies on: a
        constructor that accepts ``verbose`` and dispatch of tags to
        ``start_TAG(attrs)`` / ``end_TAG()`` methods when they exist.
        """

        def __init__(self, verbose=0):
            HTMLParser.__init__(self)
            self.verbose = verbose

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, 'start_' + tag, None)
            if handler is not None:
                handler(attrs)

        def handle_endtag(self, tag):
            handler = getattr(self, 'end_' + tag, None)
            if handler is not None:
                handler()

try:
    from html import escape as _escape  # Python 3
except ImportError:
    from cgi import escape as _escape  # Python 2

# ignore these folders in the ffox bookmarks
ignore = ['Recently Bookmarked',
          'Recent Tags',
          'Get Bookmark Add-ons',
          'Bookmarks Toolbar']


class JsonBkmks(object):
    """Parse json bookmarks from firefox.

    The bookmarks are stored in ``self.bkmks`` as a dict where the keys
    are folder names and the values are lists of (uri, name) pairs.
    """

    def __init__(self, path):
        """Read and parse the JSON file at *path*.

        Prints a message to stderr and exits with status 2 if the file
        content cannot be decoded as JSON.
        """
        self.bkmks = {}
        s = self.open_and_clean(path)
        try:
            self.d = json.loads(s)
        except ValueError:
            # stderr, because stdout is normally redirected into the
            # generated bookmark file (see Usage above).
            print('Could not parse', path, 'as JSON', file=sys.stderr)
            sys.exit(2)
        self.parse(self.d)

    def open_and_clean(self, p):
        """Return the content of file *p* with the ffox export bug repaired.

        A bug in the ffox json export means we need to remove the final
        comma, equivalent to sed s/},]}/}]}/
        """
        with open(p, 'r') as f:
            return f.read().replace('},]}', '}]}')

    def parse(self, d):
        """Walk the bookmark node *d*, filling ``self.bkmks``.

        Returns a (uri, title) pair when *d* is a bookmark and None
        otherwise. A folder records its direct bookmarks under its own
        title; sub-folders record themselves separately, which is what
        flattens the hierarchy. Folders named in ``ignore`` are skipped
        together with everything below them.
        """
        t = d.get('title') or ''
        if t in ignore:
            return None
        if 'children' in d:
            links = []
            for child in d['children']:
                # Recurse exactly once per child: a second call would
                # double the work at every nesting level.
                link = self.parse(child)
                if link:
                    links.append(link)
            if links:
                self.bkmks[t] = links
            return None
        if 'uri' in d:
            return (d['uri'], t)
        return None


class HtmlBkmks(_ParserBase):
    """Parse the bookmark.html format of emacs-w3m.

    Each h2 element names a folder, each anchor inside the following ul
    element is a bookmark, and the closing ul tag files the collected
    bookmarks under the current folder name. The result is exposed
    through the read-only ``bkmks`` property as a dict mapping folder
    names to lists of (uri, name) pairs.
    """

    def __init__(self, path, verbose=0):
        """Read the file at *path* and parse it immediately."""
        _ParserBase.__init__(self, verbose)
        self.links = []
        self.tmpuri = ''
        self.inh2 = False
        self.ina = False
        self.folder_name = ''
        self.__bkmks = {}
        # Text can arrive in several handle_data calls (for instance
        # around entities), so it is buffered until the closing tag.
        self._h2_text = []
        self._a_text = []
        with open(path, 'r') as f:
            content = f.read()
        self.parse(content)

    def getbkmks(self):
        """Return the parsed bookmarks dict."""
        return self.__bkmks

    def setbkmks(self, value=None):
        """Refuse assignment: report that ``bkmks`` is read only."""
        print('bkmks is read only')

    bkmks = property(getbkmks, setbkmks)

    def parse(self, s):
        """Feed the whole document *s* to the parser and finish parsing."""
        self.feed(s)
        self.close()

    def start_h2(self, attributes):
        self.inh2 = True
        self._h2_text = []

    def end_h2(self):
        self.inh2 = False
        title = ''.join(self._h2_text)
        if title:
            self.folder_name = title

    def handle_data(self, data):
        """Buffer text that belongs to a folder heading or to a link."""
        if self.inh2:
            self._h2_text.append(data)
        elif self.ina:
            self._a_text.append(data)

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'."
        self.ina = True
        self._a_text = []
        # Reset so an anchor without href cannot reuse the previous uri.
        self.tmpuri = ''
        for name, value in attributes:
            if name == 'href':
                self.tmpuri = value

    def end_a(self):
        """Record the finished link if it has both an href and a name."""
        if self.ina:
            name = ''.join(self._a_text)
            if self.tmpuri and name:
                self.links.append((self.tmpuri, name))
        self.ina = False

    def start_ul(self, attr):
        pass

    def end_ul(self):
        """File the links collected so far under the current folder."""
        self.__bkmks[self.folder_name] = self.links
        self.links = []


def parse_ffox(path):
    """Return the bookmarks dict parsed from the ffox JSON file *path*."""
    return JsonBkmks(path).bkmks


def parse_w3m(path):
    """Return the bookmarks dict parsed from the emacs-w3m file *path*."""
    return HtmlBkmks(path).bkmks


# helper functions
def merge(d1, d2):
    """Return a new dict holding the contents of d1 with d2 appended.

    Where a key, k, of d2 exists in d1, don't overwrite d1[k] but append
    to it those pairs of d2[k] whose uri is not already present. Keys of
    d1 with an empty list are dropped. Neither argument is modified.
    """
    merged = {}
    remaining = dict(d2)
    for folder, links in d1.items():
        if not links:
            continue
        combined = list(links)
        for uri, name in remaining.pop(folder, ()):
            if not fstin(uri, combined):
                combined.append((uri, name))
        merged[folder] = combined
    merged.update(remaining)
    return merged


def fstin(v, xs):
    "Return true if v is the first value in one of the pairs in xs"
    return any(first == v for first, _ in xs)


def out(d):
    """Print the bookmarks dict *d* to stdout as html, folders sorted by name.

    NOTE(review): the markup of this function was missing from the
    reviewed source (the tags appear to have been stripped). The layout
    below is rebuilt from what HtmlBkmks parses: an h2 heading per
    folder followed by a ul of anchors. Confirm it against a real
    emacs-w3m bookmark file.
    """
    print('<html><head><title>Bookmarks</title></head>')
    print('<body>')
    print('<h1>Bookmarks</h1>')
    for folder in sorted(d):
        print('<h2>%s</h2>' % _escape(folder, True))
        print('<ul>')
        for uri, name in d[folder]:
            # Escape so that &, <, > and " in titles or uris cannot
            # break the generated markup.
            print('<li><a href="%s">%s</a>'
                  % (_escape(uri, True), _escape(name, True)))
        # NOTE(review): emacs-w3m is believed to use this comment to
        # find the end of a section when adding bookmarks - confirm.
        print('<!--End of section (do not delete this comment)-->')
        print('</ul>')
    print('</body>')
    print('</html>')


def usage():
    """Print the command line synopsis to stderr."""
    print("""bkmks FFOX_JSON_FILE [W3M_HTML_FILE]""", file=sys.stderr)


def main(argv=None):
    """Run the conversion for *argv* (default sys.argv); return exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        usage()
        return 2
    bkmks = parse_ffox(argv[1])
    if len(argv) == 3:
        w3m = parse_w3m(argv[2])
        bkmks = merge(bkmks, w3m)
    out(bkmks)
    return 0


if __name__ == '__main__':
    sys.exit(main())