博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
java读取docx_Java读取doc、docx、xls、xlsx、ppt、pptx、pdf文件内容
阅读量:5254 次
发布时间:2019-06-14

本文共 5090 字,大约阅读时间需要 16 分钟。

读取文件信息所需依赖

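<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.12</version>
</dependency>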

读取doc文件内容

/**
 * Reads the text content of a legacy Word (.doc) file from the file system.
 *
 * <p>The file stream and the extractor are opened with try-with-resources so
 * that both are closed on every path, including when parsing fails.
 *
 * @param name path of the .doc file to read
 * @return the extracted text, or {@code null} if the file could not be opened
 *         or parsed (the failure is printed to standard error)
 */
public static String readWord(String name) {
    String text = null;
    try (FileInputStream in = new FileInputStream(name);
         WordExtractor extractor = new WordExtractor(in)) {
        text = extractor.getText();
    } catch (Exception e) {
        // Best-effort read: a missing file and a parse failure are handled the
        // same way, so one handler replaces the two identical catch blocks.
        e.printStackTrace();
    }
    return text;
}

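读取docx文件内容(注意:下面的 readDoc 使用的是 WordExtractor,只能解析 .doc 格式;读取 .docx 需改用 poi-ooxml 中的 XWPFDocument 和 XWPFWordExtractor)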

/**
 * Extracts the text of an uploaded Word document using {@code WordExtractor}.
 *
 * <p>Previously an {@code IOException} while opening the upload left the
 * extractor {@code null} and the final {@code getText()} call threw a
 * {@code NullPointerException}. The method now returns an empty string in that
 * case and closes the stream and extractor on every path.
 *
 * @param file the uploaded file
 * @return the extracted text, or an empty string if the upload is empty or an
 *         I/O error occurs while reading it
 */
public static String readDoc(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    try (InputStream inputStream = file.getInputStream();
         WordExtractor wordExtractor = new WordExtractor(inputStream)) {
        return wordExtractor.getText();
    } catch (IOException e) {
        log.warn(e.toString());
        e.printStackTrace();
        // Same result as for an empty upload instead of failing with an NPE.
        return "";
    }
}

读取xls文件内容

/**
 * Reads the first sheet of an uploaded .xls workbook as tab-separated text.
 *
 * <p>Each cell contributes its value followed by a tab character: string cells
 * contribute their text, numeric cells their {@code double} value, and every
 * other cell an empty value. Formula cells are emitted according to the type of
 * their cached result; previously a formula with a non-numeric result made
 * {@code getNumericCellValue()} throw, which silently aborted the rest of the
 * sheet. No separator is written between rows.
 *
 * @param file the uploaded file
 * @return the cell values read so far, or an empty string if the upload is
 *         empty; on a read failure the content gathered before the failure is
 *         returned and the error is logged
 */
public static String readXls(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // try-with-resources releases the workbook even when reading fails.
    try (HSSFWorkbook excel = new HSSFWorkbook(file.getInputStream())) {
        // Only the first sheet is read.
        HSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            HSSFRow row = (HSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                HSSFCell cell = (HSSFCell) cellIterator.next();
                // For formulas, dispatch on the cached result type rather than
                // assuming the result is numeric.
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue()).append('\t');
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue()).append('\t');
                } else {
                    content.append('\t');
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.warn(e.toString());
    }
    return content.toString();
}

读取xlsx文件内容

/**
 * Reads the first sheet of an uploaded .xlsx workbook as tab-separated text.
 *
 * <p>Each cell contributes its value followed by a tab character: string cells
 * contribute their text, numeric cells their {@code double} value, and every
 * other cell an empty value. Formula cells are emitted according to the type of
 * their cached result; previously a formula with a non-numeric result made
 * {@code getNumericCellValue()} throw, which silently aborted the rest of the
 * sheet. No separator is written between rows.
 *
 * @param file the uploaded file
 * @return the cell values read so far, or an empty string if the upload is
 *         empty; on a read failure the content gathered before the failure is
 *         returned and the error is logged
 */
public static String readXlsx(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // try-with-resources releases the workbook even when reading fails.
    try (XSSFWorkbook excel = new XSSFWorkbook(file.getInputStream())) {
        // Only the first sheet is read.
        XSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            XSSFRow row = (XSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                XSSFCell cell = (XSSFCell) cellIterator.next();
                // For formulas, dispatch on the cached result type rather than
                // assuming the result is numeric.
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue()).append('\t');
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue()).append('\t');
                } else {
                    content.append('\t');
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.warn(e.toString());
    }
    return content.toString();
}

读取pdf文件内容

/**
 * Reads the text content of an uploaded PDF file.
 *
 * <p>The upload stream and the parsed {@code PDDocument} are both closed on
 * every path; previously neither was ever closed.
 *
 * @param file the uploaded PDF file
 * @return the text of all pages, or an empty string if the file could not be
 *         read or parsed (the failure is logged)
 */
public static String readPdf(MultipartFile file) {
    StringBuilder content = new StringBuilder();
    try (InputStream is = file.getInputStream()) {
        PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
        parser.parse();
        // The document must be closed explicitly once text extraction is done.
        try (PDDocument document = parser.getPDDocument()) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text in positional order.
            stripper.setSortByPosition(true);
            // Extract every page, from the first to the last.
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            content.append(stripper.getText(document));
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.warn(e.toString());
    }
    return content.toString();
}

PDF文件加载有两种方式,无明显差异,方式二代码较简洁:

// Option 1: wrap the file in a stream, parse it explicitly with PDFParser and
// then fetch the resulting document. (pdfFile and document are declared
// outside this fragment.)

InputStream input = null;

input = new FileInputStream( pdfFile );

// Load the PDF document.

PDFParser parser = new PDFParser(new RandomAccessBuffer(input));

parser.parse();

document = parser.getPDDocument();

// Option 2: load the document in a single call to PDDocument.load.

document=PDDocument.load(pdfFile);

读取ppt文件内容

/**
 * Extracts the text of every slide of an uploaded .ppt presentation.
 *
 * <p>The slides are iterated directly from {@code getSlides()} because the
 * previous raw {@code List} variable could not be iterated as
 * {@code HSLFSlide}. The extractor is now closed on every path rather than
 * only after a successful read.
 *
 * @param file the uploaded file
 * @return the concatenated text of all slides, or an empty string if the
 *         upload is empty; on an I/O error the text gathered before the error
 *         is returned and the error is logged
 */
public static String readPPT(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    try (InputStream is = file.getInputStream()) {
        HSLFSlideShow hslfSlideShow = new HSLFSlideShow(is);
        // As before, the extractor is the object that gets closed; it is now
        // closed even when text extraction throws.
        try (SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hslfSlideShow)) {
            for (HSLFSlide slide : hslfSlideShow.getSlides()) {
                content.append(slideShowExtractor.getText(slide));
            }
        }
    } catch (IOException e) {
        log.warn(e.toString());
        e.printStackTrace();
    }
    return content.toString();
}

读取pptx文件内容

/**
 * Extracts the text runs of an uploaded .pptx presentation.
 *
 * <p>For every slide, the shapes returned by the shape tree's
 * {@code getSpList()} are visited and the text of each regular text run in each
 * paragraph is appended, with no separators. Shapes without a text body are
 * skipped. The collections are iterated directly because the previous raw
 * {@code List} variables could not be iterated with typed loop variables, and
 * the slide show is now closed on every path.
 *
 * @param file the uploaded file
 * @return the concatenated run text, or an empty string if the upload is
 *         empty; on a failure the text gathered before the failure is returned
 *         and the stack trace is printed
 */
public static String readPPTX(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    // The buffer is local to this call, so StringBuffer's locking is not needed.
    StringBuilder content = new StringBuilder();
    try (InputStream is = file.getInputStream();
         XMLSlideShow xmlSlideShow = new XMLSlideShow(is)) {
        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape spTree = rawSlide.getCSld().getSpTree();
            for (CTShape shape : spTree.getSpList()) {
                CTTextBody txBody = shape.getTxBody();
                if (txBody == null) {
                    // Shape carries no text.
                    continue;
                }
                for (CTTextParagraph textParagraph : txBody.getPList()) {
                    for (CTRegularTextRun textRun : textParagraph.getRList()) {
                        content.append(textRun.getT());
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return content.toString();
}

转载地址:http://jgeav.baihongyu.com/

你可能感兴趣的文章
android 签名
查看>>
vue项目中使用百度统计
查看>>
android:scaleType属性
查看>>
SuperEPC
查看>>
mysql-5.7 innodb 的并行任务调度详解
查看>>
shell脚本
查看>>
Upload Image to .NET Core 2.1 API
查看>>
Js时间处理
查看>>
Java项目xml相关配置
查看>>
三维变换概述
查看>>
第三次作业
查看>>
vue route 跳转
查看>>
【雷电】源代码分析(二)-- 进入游戏攻击
查看>>
Entityframework:“System.Data.Entity.Internal.AppConfig”的类型初始值设定项引发异常。...
查看>>
Linux中防火墙centos
查看>>
mysql新建用户,用户授权,删除用户,修改密码
查看>>
FancyCoverFlow
查看>>
JS博客
查看>>
如何设置映射网络驱动器的具体步骤和方法
查看>>
ASP.NET WebApi 基于OAuth2.0实现Token签名认证
查看>>