基于Eclipse的MapReduce项目求解最大值Reduce过程的实现
时间: 2024-05-14 14:12:35 浏览: 8
以下是使用Java将doc文档转换为XML文档并生成文件的示例代码。代码中使用了Apache POI库来读取和操作doc文档,使用DOM库来创建和操作XML文档。
```java
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
/**
 * Converts a legacy Word {@code .doc} file into a simple XML file.
 *
 * <p>The output has a {@code <document>} root, one {@code <p>} element per paragraph and one
 * {@code <r>} element per non-empty character run. Formatting is recorded as the attributes
 * {@code b}, {@code i}, {@code strike} and {@code u}, each set to {@code "true"} when the
 * corresponding formatting is present on the run.
 */
public class DocToXmlConverter {
    private static final String OUTPUT_ENCODING = "UTF-8";
    private static final String DEFAULT_INPUT_PATH = "input.doc";
    private static final String DEFAULT_OUTPUT_PATH = "output.xml";

    /**
     * Entry point. Converts the input document and writes the XML file.
     *
     * @param args optional; {@code args[0]} is the input {@code .doc} path (default
     *     {@code input.doc}) and {@code args[1]} is the output XML path (default
     *     {@code output.xml})
     */
    public static void main(String[] args) {
        String inputFilePath =
                (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputFilePath =
                (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // try-with-resources closes the input file even when conversion fails.
        try (FileInputStream in = new FileInputStream(inputFilePath)) {
            HWPFDocument doc = new HWPFDocument(in);
            Document xmlDocument = createXmlDocument(doc);
            saveXmlDocument(xmlDocument, outputFilePath);
        } catch (IOException | ParserConfigurationException | TransformerException e) {
            // Top-level boundary: report the failure instead of continuing with partial state.
            e.printStackTrace();
        }
    }

    /**
     * Builds the DOM tree for the given Word document.
     *
     * @param doc the parsed Word document
     * @return a new DOM document whose root element is {@code <document>}; never {@code null}
     * @throws ParserConfigurationException if no {@link DocumentBuilder} can be created
     */
    private static Document createXmlDocument(HWPFDocument doc)
            throws ParserConfigurationException {
        DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document xmlDocument = docBuilder.newDocument();
        Element rootElement = xmlDocument.createElement("document");
        xmlDocument.appendChild(rootElement);

        // Fetch the range once rather than calling doc.getRange() on every iteration.
        Range range = doc.getRange();
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            rootElement.appendChild(createParagraphElement(xmlDocument, paragraph));
        }
        return xmlDocument;
    }

    /**
     * Creates a {@code <p>} element containing one {@code <r>} child per non-empty run.
     *
     * @param xmlDocument the owner document used to create nodes
     * @param paragraph the Word paragraph to convert
     * @return the populated {@code <p>} element (not yet attached to the tree)
     */
    private static Element createParagraphElement(Document xmlDocument, Paragraph paragraph) {
        Element paragraphElement = xmlDocument.createElement("p");
        int runCount = paragraph.numCharacterRuns();
        for (int i = 0; i < runCount; i++) {
            CharacterRun run = paragraph.getCharacterRun(i);
            String rawText = run.text();
            if (rawText == null) {
                continue;
            }
            // Characters outside the XML 1.0 Char production cannot be represented in a
            // well-formed XML 1.0 document, so drop them before creating the text node.
            String text = stripInvalidXmlChars(rawText);
            if (text.isEmpty()) {
                continue;
            }
            Element runElement = xmlDocument.createElement("r");
            for (Map.Entry<String, String> entry : createRunAttributes(run).entrySet()) {
                runElement.setAttribute(entry.getKey(), entry.getValue());
            }
            Node textNode = xmlDocument.createTextNode(text);
            runElement.appendChild(textNode);
            paragraphElement.appendChild(runElement);
        }
        return paragraphElement;
    }

    /**
     * Collects the formatting flags of a run as attribute name/value pairs.
     *
     * @param run the character run to inspect
     * @return a map containing only the flags that are set, each mapped to {@code "true"}
     */
    private static Map<String, String> createRunAttributes(CharacterRun run) {
        Map<String, String> attributes = new HashMap<>();
        if (run.isBold()) {
            attributes.put("b", "true");
        }
        if (run.isItalic()) {
            attributes.put("i", "true");
        }
        if (run.isStrikeThrough()) {
            attributes.put("strike", "true");
        }
        // NOTE(review): HWPF CharacterRun exposes the underline style as an int code rather
        // than a boolean; 0 is taken to mean "no underline" - confirm against the POI
        // version in use.
        if (run.getUnderlineCode() != 0) {
            attributes.put("u", "true");
        }
        return attributes;
    }

    /**
     * Returns {@code text} with every code point removed that is not allowed by the XML 1.0
     * {@code Char} production (this also removes unpaired surrogates).
     */
    private static String stripInvalidXmlChars(String text) {
        StringBuilder cleaned = new StringBuilder(text.length());
        int index = 0;
        while (index < text.length()) {
            int codePoint = text.codePointAt(index);
            index += Character.charCount(codePoint);
            if (isValidXmlChar(codePoint)) {
                cleaned.appendCodePoint(codePoint);
            }
        }
        return cleaned.toString();
    }

    /** Tests a code point against the XML 1.0 {@code Char} production. */
    private static boolean isValidXmlChar(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }

    /**
     * Serializes the DOM document to {@code outputFilePath} as indented UTF-8 XML.
     *
     * @param xmlDocument the document to write
     * @param outputFilePath the destination file path
     * @throws IOException if the file cannot be opened or written
     * @throws TransformerException if the transformer cannot be created or serialization fails
     */
    private static void saveXmlDocument(Document xmlDocument, String outputFilePath)
            throws IOException, TransformerException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.ENCODING, OUTPUT_ENCODING);
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");

        // Closing the writer flushes buffered output and releases the file handle.
        try (Writer writer = new OutputStreamWriter(
                new FileOutputStream(new File(outputFilePath)), OUTPUT_ENCODING)) {
            transformer.transform(new DOMSource(xmlDocument), new StreamResult(writer));
        }
    }
}
```
在这个示例代码中,我们首先使用Apache POI库读取doc文档并创建一个HWPFDocument对象。然后,我们使用DOM库创建一个空的XML文档对象,并将其根元素设置为document。接下来,我们遍历文档中的每个段落,并为每个段落创建一个XML元素。对于每个段落中的每个字符运行,我们都创建一个XML元素并为其设置文本和属性(例如粗体、斜体等)。最后,我们将XML文档写入输出文件。
这个示例代码还兼容特殊符号,因为DOM库在序列化时会自动转义它们。例如,如果文档中包含“<”字符,它会被转换为“`&lt;`”序列。这确保了生成的XML文档是有效的,并且可以正确解析。