Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
elif isstr(args):
subfiles = [args, ]
else:
# make a copy in case original args are modified
# Not sure whether this really is needed...
subfiles = tuple(arg for arg in args)
for subfile in subfiles:
with zipper.open(subfile, 'r') as handle:
yield subfile, handle
if not args:
self.did_iter_all = True
except KeyError as orig_err:
# Note: do not change text of this message without adjusting
# conditions in except handlers
raise BadOOXML(self.filename,
'invalid subfile: ' + str(orig_err))
except BadZipfile:
raise BadOOXML(self.filename, 'not in zip format')
finally:
if zipper:
zipper.close()
return DOCTYPE_NONE
is_doc = False
is_xls = False
is_ppt = False
try:
for _, elem, _ in parser.iter_xml(FILE_CONTENT_TYPES):
logger.debug(u' ' + debug_str(elem))
try:
content_type = elem.attrib['ContentType']
except KeyError: # ContentType not an attr
continue
is_xls |= content_type.startswith(CONTENT_TYPES_EXCEL)
is_doc |= content_type.startswith(CONTENT_TYPES_WORD)
is_ppt |= content_type.startswith(CONTENT_TYPES_PPT)
except BadOOXML as oo_err:
if oo_err.more_info.startswith('invalid subfile') and \
FILE_CONTENT_TYPES in oo_err.more_info:
# no FILE_CONTENT_TYPES in zip, so probably no ms office xml.
return DOCTYPE_NONE
raise
if is_doc and not is_xls and not is_ppt:
return DOCTYPE_WORD
if not is_doc and is_xls and not is_ppt:
return DOCTYPE_EXCEL
if not is_doc and not is_xls and is_ppt:
return DOCTYPE_POWERPOINT
if not is_doc and not is_xls and not is_ppt:
return DOCTYPE_NONE
logger.warning('Encountered contradictory content types')
return DOCTYPE_MIXED
def iter_files(self, args=None):
""" Find files in zip or just give single xml file """
if self.is_single_xml():
if args:
raise BadOOXML(self.filename, 'xml has no subfiles')
with open(self.filename, 'rb') as handle:
yield None, handle # the subfile=None is needed in iter_xml
self.did_iter_all = True
else:
zipper = None
subfiles = None
try:
zipper = ZipFile(self.filename)
if not args:
subfiles = zipper.namelist()
elif isstr(args):
subfiles = [args, ]
else:
# make a copy in case original args are modified
# Not sure whether this really is needed...
subfiles = tuple(arg for arg in args)
# have a TAG_W_P
for curr_elem in subs:
# check if w:r; parse children to pull out first FLDCHAR/INSTRTEXT
elem = None
if curr_elem.tag in TAG_W_R:
for child in curr_elem:
if child.tag in TAG_W_FLDCHAR or \
child.tag in TAG_W_INSTRTEXT:
elem = child
break
if elem is None:
continue # no fldchar or instrtext in this w:r
else:
elem = curr_elem
if elem is None:
raise ooxml.BadOOXML(filepath,
'Got "None"-Element from iter_xml')
# check if FLDCHARTYPE and whether "begin" or "end" tag
attrib_type = elem.attrib.get(ATTR_W_FLDCHARTYPE[0]) or \
elem.attrib.get(ATTR_W_FLDCHARTYPE[1])
if attrib_type is not None:
if attrib_type == "begin":
level += 1
if attrib_type == "end":
level -= 1
if level in (0, -1): # edge-case; level gets -1
all_fields.append(ddetext)
ddetext = u''
level = 0 # reset edge-case
# concatenate the text of the field, if present:
"""
if self._is_single_xml is not None:
return self._is_single_xml
if is_zipfile(self.filename):
self._is_single_xml = False
return False
# find prog id in xml prolog
match = None
with open(self.filename, 'r') as handle:
match = re.search(OFFICE_XML_PROGID_REGEX, handle.read(1024))
if match:
self._is_single_xml = True
return True
raise BadOOXML(self.filename, 'is no zip and has no prog_id')
def __init__(self, filename, more_info=None):
    """
    Set up the exception for a file that is not valid Office XML.

    The message always starts with "<filename> is not an Office XML file";
    when `more_info` is truthy it is appended after a ': ' separator.
    Both arguments are stored unchanged as attributes on the instance.

    :param filename: name of the offending file, used in the message
    :param more_info: optional text with further detail (default: None)
    """
    # Only add the ': <detail>' suffix if there is something to report
    if more_info:
        detail = ': ' + more_info
    else:
        detail = ''
    message = '{0} is not an Office XML file{1}'.format(filename, detail)
    super(BadOOXML, self).__init__(message)

    # Keep the raw pieces on the instance in addition to the formatted text
    self.filename = filename
    self.more_info = more_info