1500字范文,内容丰富有趣,写作好帮手!
1500字范文 > 上传doc pdf ppt png jpg html文件并解析内容

上传doc pdf ppt png jpg html文件并解析内容

时间:2023-02-18 00:54:19

相关推荐

上传doc pdf ppt png jpg html文件并解析内容

依赖:

<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId><version>2.0.12</version></dependency>

controller:

@DataLog(operationName = "批量新增人才", operationDesc = "上传文件", methodType = MethodTypeEnum.ADD_TYPE)@ApiOperation(value = "批量新增人才", notes = "批量新增人才")@PostMapping(Urls.RecA01.uploadFileAddA01)@ApiImplicitParams({@ApiImplicitParam(name = "state", value = "人才库类型", required = true)})public JsonObject<Object> uploadFileAddA01(@RequestParam(value = "file") MultipartFile[] file, String state, HttpServletRequest request) throws IOException {String result = "";Map<String, String> param = new HashMap<>();param.put("state", state);userDir = fileConfigProperties.getResources();// 存放文件目录//TODO 此处获取了配置文件信息的值 需在配置文件新增属性String folderSources = userDir + File.separator + "temp" + File.separator + "rckfile";// 设置日期为文件夹名称SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss");String folderName = df.format(new Date()).toString();String folderPath = folderSources + File.separator + folderName;File rmbFile = new File(folderPath);// 文件夹如果不存在就创建该文件夹if (!rmbFile.exists()) {rmbFile.mkdirs();}for (int i = 0; file != null && i < file.length; i++) {// 获得文件名:String name = file[i].getOriginalFilename();// 获得文件名,不带后缀String filename = name.substring(0, name.lastIndexOf('.'));// 获得输入流:InputStream streamList = file[i].getInputStream();File newPhotoFile = new File(folderPath + File.separator + name);OutputStream out = new FileOutputStream(newPhotoFile);// 保存文件boolean writeFlag = this.write(streamList, out);if (writeFlag) {out.flush();out.close();}}// 处理文件夹中文件数据result = recA01Service.uploadFileAddA01(folderPath, param, request);return new JsonSuccessObject<>(result);}public boolean write(InputStream in, OutputStream out) {boolean flag = true;int BUFSIZE = 65536;int s;try {byte[] buf = new byte[BUFSIZE];while ((s = in.read(buf)) > -1) {out.write(buf, 0, s);}} catch (IOException e) {flag = false;e.printStackTrace();logger.error("异常信息:" + e.getMessage());}return flag;}

impl:

@Value("${ocrServe.url}")
private String ocrUrl;

/** Captures, as group 2, each recognised text fragment in the OCR service's JSON response. */
private static final Pattern OCR_TEXT_PATTERN =
        Pattern.compile("(\"text\": \")(.*?)(\", \"text_region\")");

public String getocrUrl() {
    return ocrUrl;
}

/**
 * Parses every file in {@code path} as a resume, extracts name, phone number and e-mail address
 * from its text, and stores one talent-pool record per file.
 *
 * <p>Processing stops at the first file from which no name can be extracted; records saved for
 * earlier files are kept.
 *
 * @param path    folder containing the uploaded files
 * @param param   request parameters; the value under {@code "state"} is the talent-pool type
 * @param request current HTTP request, forwarded to {@code saveRecA01}
 * @return {@code "导入成功"} when every file yielded a name, otherwise
 *         {@code "该简历中不存在用户姓名!"}
 * @throws IOException if {@code path} cannot be listed as a directory
 */
@Override
public String uploadFileAddA01(String path, Map<String, String> param, HttpServletRequest request)
        throws IOException {
    File[] files = new File(path).listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a readable directory.
        throw new IOException("Cannot list upload directory: " + path);
    }
    // Talent-pool type, identical for every file of the batch.
    String state = param.get("state");

    for (File f : files) {
        if (!f.isFile()) {
            continue;
        }
        String fileName = f.getName();
        // Lower-case the extension so that "PDF" or "Docx" are recognised as well.
        String suffix = fileName.substring(fileName.lastIndexOf('.') + 1)
                .toLowerCase(java.util.Locale.ROOT);
        String text = extractText(f, suffix);

        String name = getName(text);
        String phone = getPhoneNo(text);
        String email = getEmail(text);
        if (name != null && !name.isEmpty()) {
            this.saveRecA01(name, phone, email, null, state, request);
        } else {
            return "该简历中不存在用户姓名!";
        }
    }
    return "导入成功";
}

/**
 * Dispatches on the file extension and returns whatever text could be extracted. Extraction is
 * best-effort: a failure is reported and the text gathered so far (possibly empty) is returned.
 */
private String extractText(File f, String suffix) {
    StringBuilder out = new StringBuilder();
    try {
        switch (suffix) {
            case "doc":
                appendDocText(f, out);
                break;
            case "docx":
                appendDocxText(f, out);
                break;
            case "pdf":
                appendPdfText(f, out);
                break;
            case "htm":
            case "html":
                appendHtmlText(f, out);
                break;
            case "png":
            case "jpg":
            case "jpeg":
                appendImageText(f, out);
                break;
            case "ppt":
            case "pptx":
                appendSlideText(f, out);
                break;
            default:
                break;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return out.toString();
}

/** Appends the text of a binary Word (.doc) file. */
private void appendDocText(File doc, StringBuilder out) throws Exception {
    try (InputStream is = new FileInputStream(doc);
            WordExtractor extractor = new WordExtractor(is)) {
        out.append(extractor.getText());
    }
}

/** Appends the text of an OOXML Word (.docx) file. */
private void appendDocxText(File docx, StringBuilder out) throws Exception {
    OPCPackage opcPackage = POIXMLDocument.openPackage(docx.getPath());
    POIXMLTextExtractor extractor;
    try {
        extractor = new XWPFWordExtractor(opcPackage);
    } catch (Exception e) {
        // No extractor took ownership of the package: release it without writing anything back.
        opcPackage.revert();
        throw e;
    }
    try {
        out.append(extractor.getText());
    } finally {
        extractor.close();
    }
}

/** Appends the text of a PDF, page by page, in positional (reading) order. */
private void appendPdfText(File pdf, StringBuilder out) throws Exception {
    try (PDDocument document = PDDocument.load(pdf)) {
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        int pageCount = document.getNumberOfPages();
        for (int page = 1; page <= pageCount; page++) {
            stripper.setStartPage(page);
            stripper.setEndPage(page);
            out.append(stripper.getText(document));
        }
    }
}

/** Appends the raw content of an HTML file, read as UTF-8, one line per line. */
private void appendHtmlText(File html, StringBuilder out) throws Exception {
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(html), "utf-8"))) {
        String line;
        // readLine() returning null is the end-of-file signal; ready() is not.
        while ((line = br.readLine()) != null) {
            // Keep the line break so tokens on adjacent lines are not fused together.
            out.append(line).append('\n');
        }
    }
}

/** Posts an image to the configured OCR service and appends every recognised text fragment. */
private void appendImageText(File image, StringBuilder out) throws Exception {
    HttpClient client = new HttpClient();
    PostMethod postMethod = new PostMethod(getocrUrl());
    try {
        // The file is sent as the multipart field "file".
        Part[] parts = {new FilePart("file", image)};
        postMethod.setRequestEntity(new MultipartRequestEntity(parts, postMethod.getParams()));
        // Uploaded files may be large, hence the generous connection timeout.
        client.getHttpConnectionManager().getParams().setConnectionTimeout(50000);

        int status = client.executeMethod(postMethod);
        if (status != HttpStatus.SC_OK) {
            return;
        }
        InputStream body = postMethod.getResponseBodyAsStream();
        if (body == null) {
            return;
        }
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        int len;
        while ((len = body.read(buffer)) != -1) {
            baos.write(buffer, 0, len);
        }
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the OCR service answers in UTF-8 - confirm.
        String json = baos.toString("UTF-8");

        // Response fragments look like: "text": "女,17302731183", "text_region": ...
        Matcher m = OCR_TEXT_PATTERN.matcher(json);
        while (m.find()) {
            // One fragment per line, so neighbouring fragments do not merge into one token.
            out.append(m.group(2)).append('\n');
        }
    } finally {
        postMethod.releaseConnection();
    }
}

/**
 * Appends the text runs of every shape on every slide, one paragraph per line.
 *
 * <p>NOTE(review): ".ppt" files are routed here as well, but XMLSlideShow is POI's OOXML
 * reader, so legacy binary presentations presumably fail and yield no text - confirm.
 */
private void appendSlideText(File slides, StringBuilder out) throws Exception {
    try (FileInputStream in = new FileInputStream(slides.getPath());
            XMLSlideShow slideShow = new XMLSlideShow(in)) {
        for (XSLFSlide slide : slideShow.getSlides()) {
            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape shapeTree = rawSlide.getCSld().getSpTree();
            for (CTShape shape : shapeTree.getSpArray()) {
                CTTextBody textBody = shape.getTxBody();
                if (textBody == null) {
                    continue;
                }
                for (CTTextParagraph paragraph : textBody.getPArray()) {
                    for (CTRegularTextRun run : paragraph.getRArray()) {
                        out.append(run.getT());
                    }
                    out.append('\n');
                }
            }
        }
    }
}

application.yml(需要配置 OCR 识别服务的地址,对应代码中的 ${ocrServe.url}),例如:

ocrServe:
  url: http://192.168.4.188:8869/paddle_ocr

提取姓名的方法(适用于不确定名字是一个字还是两个字的情况)

public static String getName(String str) {// 定义姓名为开头String start = "(姓名)([\\s\\S]*)";// 定义姓氏开头的正则String surname = "(王|李|张|刘|陈|杨|黄|赵|吴|周|徐|孙|马|朱|胡|郭|何|高|林|罗|郑|梁|谢|宋|唐|许|韩|冯|邓|曹|彭|曾" +"|养|须|丰|巢|蒯|相|后|红|权逯|盖益|桓|公|万俟|司马|上官|夏侯|诸葛|闻人|东方|赫连|皇甫|尉迟|公羊|澹台" +"|公冶|宗政|濮阳|淳于|单于|太叔|申屠|公孙|仲孙|轩辕|令狐|钟离|宇文|长孙|慕容|鲜于|闾丘|司徒|司空|亓官" +"|司寇|仉|督|子车|颛孙|端木|巫马|公西|漆雕|乐正|壤驷|公良|拓跋|夹谷|宰父|谷粱|法|汝|钦|段干|百里|东郭" +"|南门|呼延|归海|羊舌|微生|帅|缑|亢|况|郈|琴|梁丘|左丘|东门|西门|佘|佴|伯|赏|南宫|墨|哈|谯" +"|肖|田|董|袁|潘|于|蒋|蔡|余|杜|叶|程|苏|魏|吕|丁|任|沈|姚|卢|姜|崔|钟|谭|陆|汪|范|金|石|廖|贾|夏|韦|傅" +"|方|白|邹|孟|熊|秦|邱|江|尹|薛|闫|段|雷|侯|龙|史|黎|贺|顾|毛|郝|龚|邵|万|钱|覃|武|戴|孔|汤|庞|樊|兰|殷" +"|施|陶|洪|翟|安|颜|倪|严|牛|温|芦|季|俞|章|鲁|葛|伍|申|尤|毕|聂|柴|焦|向|柳|邢|岳|齐|沿|梅|莫|庄|辛|管" +"|祝|左|涂|谷|祁|时|舒|耿|牟|卜|路|詹|关|苗|凌|费|纪|靳|盛|童|欧|甄|项|曲|成|游|欧阳|裴|席|卫|查|屈|鲍|位" +"|覃|霍|翁|隋|植|甘|景|薄|单|包|司|柏|宁|柯|阮|桂|闵|阳|解|强|丛|华|车|冉|房|边|辜|吉|饶|刁|瞿|戚|丘" +"|古|米|池|滕|晋|苑|邬|臧|畅|宫|来|嵺|苟|全|褚|廉|简|娄|盖|符|奚|木|穆|党|燕|郎|邸|冀|谈|姬|屠|连|郜|晏" +"|栾|郁|商|蒙|计|喻|揭|窦|迟|宇|敖|糜|鄢|冷|卓|花|艾|蓝|都|巩|稽|井|练|仲|乐|虞|卞|封|竺|冼|原|官|衣|楚" +"|佟|栗|匡|宗|应|台|巫|鞠|僧|桑|荆|谌|银|扬|明|沙|薄|伏|岑|习|胥|保|和|蔺|水|云|昌|凤|酆|常|皮|康|元|平" +"|萧|湛|禹|无|贝|茅|麻|危|骆|支|咎|经|裘|缪|干|宣|贲|杭|诸|钮|嵇|滑|荣|荀|羊|於|惠|家|芮|羿|储|汲|邴|松" +"|富|乌|巴|弓|牧|隗|山|宓|蓬|郗|班|仰|秋|伊|仇|暴|钭|厉|戎|祖|束|幸|韶|蓟|印|宿|怀|蒲|鄂|索|咸|籍|赖|乔" +"|阴|能|苍|双|闻|莘|贡|逢|扶|堵|宰|郦|雍|却|璩|濮|寿|通|扈|郏|浦|尚|农|别|阎|充|慕|茹|宦|鱼|容|易|慎|戈" +"|庚|终|暨|居|衡|步|满|弘|国|文|寇|广|禄|阙|东|殴|殳|沃|利|蔚|越|夔|隆|师|厍|晃|勾|融|訾|阚|那|空|毋|乜" +"|笪|年|爱|仝|代)";// 中文正则String type = "([\u4E00-\u9FA5])";// 判断有一个字还是两个字// 取到姓名及数据for (int i = 0; i < 2; i++) {String regEx = "";if (i == 0) {regEx = start + surname + type + type;} else {regEx = start + surname + type;}Pattern p = pile(regEx);Matcher m = p.matcher(str);while (m.find()) {// 姓名:张三String personName = m.group();// 取到数据String name = "";if (personName != null) {String regEx1 = "";if (i == 0) {regEx1 = surname + type + type;} else {regEx1 = surname + type;}Pattern p1 = pile(regEx1);Matcher m1 = p1.matcher(personName);while (m1.find()) {name = m1.group();return name;}}}}return null;}

提取手机号

public static String getPhoneNo(String text) {//手机号正则String regExp = "(1[3-9]\\d{9})";Pattern phonePattern = pile(regExp);Matcher matcher = phonePattern.matcher(text);while (matcher.find()) {return matcher.group();}return null;}

提取邮箱

public static String getEmail(String text) {//邮箱正则String regex = "[a-za-z0-9_-]+@\\w+\\.[a-z]+(]+)?";Pattern phonePattern = pile(regex);Matcher matcher = phonePattern.matcher(text);while (matcher.find()) {return matcher.group();}return null;}

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。