Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_rough_doctype(self):
"""Checks all samples, expect either ole files or good ooxml output"""
# NOTE(review): indentation was lost in this excerpt and the body stops after
# the exception lists below; the part that actually walks the samples and
# asserts on them is not visible here -- confirm against the full test module.
# map from extension to expected doctype
# (the 'xml' entry is a tuple of two doctype constants rather than a single
# one; the OpenDocument extensions ods/odt/odp map to ooxml.DOCTYPE_NONE)
ext2doc = dict(
docx=ooxml.DOCTYPE_WORD, docm=ooxml.DOCTYPE_WORD,
dotx=ooxml.DOCTYPE_WORD, dotm=ooxml.DOCTYPE_WORD,
xml=(ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_WORD_XML),
xlsx=ooxml.DOCTYPE_EXCEL, xlsm=ooxml.DOCTYPE_EXCEL,
xlsb=ooxml.DOCTYPE_EXCEL, xlam=ooxml.DOCTYPE_EXCEL,
xltx=ooxml.DOCTYPE_EXCEL, xltm=ooxml.DOCTYPE_EXCEL,
pptx=ooxml.DOCTYPE_POWERPOINT, pptm=ooxml.DOCTYPE_POWERPOINT,
ppsx=ooxml.DOCTYPE_POWERPOINT, ppsm=ooxml.DOCTYPE_POWERPOINT,
potx=ooxml.DOCTYPE_POWERPOINT, potm=ooxml.DOCTYPE_POWERPOINT,
ods=ooxml.DOCTYPE_NONE, odt=ooxml.DOCTYPE_NONE,
odp=ooxml.DOCTYPE_NONE,
)
# files that are neither OLE nor xml:
# (two tuples: names in except_files, extensions in except_extns; how they are
# matched against sample files is not shown in this excerpt)
except_files = 'empty', 'text'
except_extns = 'rtf', 'csv', 'zip'
def process_excel_xml(filepath):
    """Collect DDE links from cell formulas of an Excel-generated xml file.

    Every element yielded by ``ooxml.XmlParser(filepath).iter_xml()`` is
    inspected. Only elements whose lower-cased tag is ``cell`` (bare or with a
    ``{...}`` prefix) are considered. For such a cell, the value of the first
    attribute whose lower-cased name is ``formula`` (bare or prefixed) is
    matched against ``XML_DDE_FORMAT``; on a match the first two regex groups
    are joined with a space and recorded.

    TODO: did not manage to create dde-link in the 2007+-xml-format. Find out
          whether this is possible at all. If so, extend this function

    :param filepath: handed unchanged to ``ooxml.XmlParser``
    :returns: unicode string with one recorded link per line; empty string if
              nothing matched
    """
    found_links = []
    for _, node, _ in ooxml.XmlParser(filepath).iter_xml():
        tag_name = node.tag.lower()
        if not (tag_name == 'cell' or tag_name.endswith('}cell')):
            # anything that is not a cell cannot carry a formula we care about
            continue

        # take the first attribute that looks like a formula, if any
        formula = None
        for attr_name in node.keys():
            lowered = attr_name.lower()
            if lowered == 'formula' or lowered.endswith('}formula'):
                formula = node.get(attr_name)
                break
        if formula is None:
            continue

        logger.debug(u'found cell with formula {0}'.format(formula))
        dde_match = re.match(XML_DDE_FORMAT, formula)
        if dde_match:
            found_links.append(u' '.join(dde_match.groups()[:2]))

    return u'\n'.join(found_links)
def process_xlsx(filepath):
""" process an OOXML excel file (e.g. .xlsx or .xlsb or .xlsm) """
# NOTE(review): this excerpt is unindented and ends inside the `try` of the
# second loop; what is done with each non-xml subfile, the matching `except`
# and the return value are not visible here.
dde_links = []
parser = ooxml.XmlParser(filepath)
# first pass: xml parts. The tag is lower-cased before comparison and is
# accepted either bare or when it ends with '}ddelink' (presumably a
# namespace-qualified tag -- confirm).
for _, elem, _ in parser.iter_xml():
tag = elem.tag.lower()
if tag == 'ddelink' or tag.endswith('}ddelink'):
# we have found a dde link. Try to get more info about it
# (unlike the tag, the attribute names below are matched case-sensitively;
# a link with neither attribute is still recorded, as an empty string)
link_info = []
if 'ddeService' in elem.attrib:
link_info.append(elem.attrib['ddeService'])
if 'ddeTopic' in elem.attrib:
link_info.append(elem.attrib['ddeTopic'])
dde_links.append(u' '.join(link_info))
# binary parts, e.g. contained in .xlsb
for subfile, content_type, handle in parser.iter_non_xml():
try:
logger.info('Parsing non-xml subfile {0} with content type {1}'
.format(subfile, content_type))
# NOTE(review): this excerpt has lost its indentation and looks like pieces of
# more than one snippet joined together; read the nesting from the keywords.
# Check for old office formats
try:
doctype = ooxml.get_type(__sessions__.current.file.path)
OOXML_FILE = True
except Exception as exc:
# any exception from get_type is treated as "not an OOXML file";
# exc itself is not used in this excerpt
OOXML_FILE = False
# set defaults
XLSX_FILE = False
EXCEL_XML_FILE = False
DOCX_FILE = False
# translate the detected doctype into at most one of the three flags above;
# doctype is only read when get_type succeeded
if OOXML_FILE is True:
if doctype == ooxml.DOCTYPE_EXCEL:
XLSX_FILE = True
elif doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
EXCEL_XML_FILE = True
# only the Word *xml* doctypes set DOCX_FILE here; ooxml.DOCTYPE_WORD is not
# checked in this excerpt
elif doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
DOCX_FILE = True
# Tests to check for valid Office structures.
# Despite its name, XML_FILE is the result of zipfile.is_zipfile(), i.e. it
# says the file is a zip archive.
OLE_FILE = olefile.isOleFile(__sessions__.current.file.path)
XML_FILE = zipfile.is_zipfile(__sessions__.current.file.path)
if OLE_FILE:
ole = olefile.OleFileIO(__sessions__.current.file.path)
elif XML_FILE:
zip_xml = zipfile.ZipFile(__sessions__.current.file.path, 'r')
# NOTE(review): OLD_XML and MHT_FILE are not assigned anywhere in this
# excerpt -- presumably set earlier in the original module; confirm.
elif OLD_XML:
pass
elif MHT_FILE:
pass
# NOTE(review): `ole` is only assigned in the `if OLE_FILE` branch above, so
# this branch would use it unassigned; the `return process_doc(ole)` line looks
# like it was spliced in from a different snippet -- verify against the source.
elif DOCX_FILE:
return process_doc(ole)
# NOTE(review): unindented tail of a longer routine (the `return` statements
# need an enclosing function that is not visible). The steps below are tried in
# order and the first one that applies returns.
# Step 1: compare the first 4 bytes of the file with RTF_START.
with open(filepath, 'rb') as file_handle:
# TODO: here we should not assume this is a file on disk, filepath can be a file object
if file_handle.read(4) == RTF_START:
logger.debug('Process file as rtf')
return process_rtf(file_handle, field_filter_mode)
# Step 2: ask ooxml for the document type; if that raises, the exception is
# only logged at debug level and doctype is set to None.
try:
doctype = ooxml.get_type(filepath)
logger.debug('Detected file type: {0}'.format(doctype))
except Exception as exc:
logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
doctype = None
# Step 3: dispatch on doctype.
if doctype == ooxml.DOCTYPE_EXCEL:
logger.debug('Process file as excel 2007+ (xlsx)')
return process_xlsx(filepath)
if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
logger.debug('Process file as xml from excel 2003/2007+')
return process_excel_xml(filepath)
# NOTE(review): this call omits field_filter_mode although the final
# process_docx call below passes it -- confirm that is intended.
if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
logger.debug('Process file as xml from word 2003/2007+')
return process_docx(filepath)
# doctype None (e.g. get_type raised above) falls back to csv processing
if doctype is None:
logger.debug('Process file as csv')
return process_csv(filepath)
# could be docx; if not: this is the old default code path
logger.debug('Process file as word 2007+ (docx)')
return process_docx(filepath, field_filter_mode)
# NOTE(review): unindented fragment that starts inside a block where
# `file_handle` is already open; the statement that opens it is not part of
# this fragment. Checks are tried in order and the first match returns.
# TODO: here we should not assume this is a file on disk, filepath can be a file object
# compare the first 4 bytes read from file_handle with RTF_START
if file_handle.read(4) == RTF_START:
logger.debug('Process file as rtf')
return process_rtf(file_handle, field_filter_mode)
# ask ooxml for the document type; a failure is only logged at debug level and
# leaves doctype as None
try:
doctype = ooxml.get_type(filepath)
logger.debug('Detected file type: {0}'.format(doctype))
except Exception as exc:
logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
doctype = None
# dispatch on doctype
if doctype == ooxml.DOCTYPE_EXCEL:
logger.debug('Process file as excel 2007+ (xlsx)')
return process_xlsx(filepath)
if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
logger.debug('Process file as xml from excel 2003/2007+')
return process_excel_xml(filepath)
# NOTE(review): field_filter_mode is not passed here, unlike the final
# process_docx call below -- confirm that is intended.
if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
logger.debug('Process file as xml from word 2003/2007+')
return process_docx(filepath)
# doctype None (e.g. get_type raised above) falls back to csv processing
if doctype is None:
logger.debug('Process file as csv')
return process_csv(filepath)
# could be docx; if not: this is the old default code path
logger.debug('Process file as word 2007+ (docx)')
return process_docx(filepath, field_filter_mode)
# NOTE(review): unindented fragment that begins mid-branch; the condition
# guarding this first return is not visible here.
return process_rtf(file_handle, field_filter_mode)
# ask ooxml for the document type; a failure is only logged at debug level and
# leaves doctype as None
try:
doctype = ooxml.get_type(filepath)
logger.debug('Detected file type: {0}'.format(doctype))
except Exception as exc:
logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
doctype = None
# dispatch on doctype; the first matching check returns
if doctype == ooxml.DOCTYPE_EXCEL:
logger.debug('Process file as excel 2007+ (xlsx)')
return process_xlsx(filepath)
if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
logger.debug('Process file as xml from excel 2003/2007+')
return process_excel_xml(filepath)
# NOTE(review): field_filter_mode is not passed here, unlike the final
# process_docx call below -- confirm that is intended.
if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
logger.debug('Process file as xml from word 2003/2007+')
return process_docx(filepath)
# doctype None (e.g. get_type raised above) falls back to csv processing
if doctype is None:
logger.debug('Process file as csv')
return process_csv(filepath)
# could be docx; if not: this is the old default code path
logger.debug('Process file as word 2007+ (docx)')
return process_docx(filepath, field_filter_mode)
# NOTE(review): the next six lines are two identical, truncated fragments. Each
# `startswith(b'` call is cut off before its argument, so the byte prefix being
# tested is not visible here -- recover it from the original source before use.
return
file_data = __sessions__.current.file.data
if file_data.startswith(b'
return
file_data = __sessions__.current.file.data
if file_data.startswith(b'
# NOTE(review): unindented tail of a longer routine; the condition guarding
# this first return is not visible in the excerpt.
return process_xls(filepath)
# when is_ppt() reports a PowerPoint file, an empty unicode string is returned
# without further parsing
if is_ppt(filepath):
logger.debug('is ppt - cannot have DDE')
return u''
# everything else reaching this point is handled as a Word 2003 document
logger.debug('Process file as word 2003 (doc)')
# OleFileIO is used as a context manager, so its cleanup runs when
# process_doc returns or raises
with olefile.OleFileIO(filepath, path_encoding=None) as ole:
return process_doc(ole)
with open(filepath, 'rb') as file_handle:
# TODO: here we should not assume this is a file on disk, filepath can be a file object
if file_handle.read(4) == RTF_START:
logger.debug('Process file as rtf')
return process_rtf(file_handle, field_filter_mode)
try:
doctype = ooxml.get_type(filepath)
logger.debug('Detected file type: {0}'.format(doctype))
except Exception as exc:
logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
doctype = None
if doctype == ooxml.DOCTYPE_EXCEL:
logger.debug('Process file as excel 2007+ (xlsx)')
return process_xlsx(filepath)
if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
logger.debug('Process file as xml from excel 2003/2007+')
return process_excel_xml(filepath)
if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
logger.debug('Process file as xml from word 2003/2007+')
return process_docx(filepath)
if doctype is None:
logger.debug('Process file as csv')