Java获取一个文本文件的编码格式的实现思路

文本文件是我们在 Windows 平台下常用的一种文件格式,其默认编码会随着操作系统语言的不同而不同

那么如何使用程序获取“文本文件”的编码方式呢?

文件的编码格式决定了文件可存储的字符类型,所以得到文件的编码格式至关重要

下文笔者分享获取文本文件编码格式的方法,如下所示:

实现思路:

读取文件流的前3个字节,
根据其值(BOM 标记)即可判断文本文件的编码方式;
若文件开头没有 BOM 标记,则继续扫描文件内容,按 UTF-8 多字节序列的规律进行推断,无法判断时默认返回 GBK

例:

package com.java265.other;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
 
/**
 * java265.com - guesses the character encoding of a text file.
 *
 * <p>The guess is based on the first three bytes of the file (byte-order marks and a few
 * fixed byte patterns) and, when none of them match, on a scan of the content for a
 * 3-byte UTF-8 sequence. It is a heuristic, not a full charset detector.
 */
public class Test {

    /** Charset name returned when nothing more specific is detected or the file cannot be read. */
    private static final String DEFAULT_CHARSET = "GBK";

    /** Number of leading bytes inspected for a signature. */
    private static final int SIGNATURE_LENGTH = 3;

    /**
     * Prints the detected encoding of a file.
     *
     * @param args optional; {@code args[0]} is the path of the file to inspect. When absent,
     *     the sample path {@code E://person/java265.com/java.txt} is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0)
                ? args[0]
                : "E://person/java265.com/java.txt";
        File file = new File(path);
        System.out.println(GetEncoding(file));
    }

    /**
     * Guesses the encoding of {@code file}.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>An empty file yields {@code "GBK"}.</li>
     *   <li>The first three bytes are matched against known signatures
     *       (see {@link #charsetFromSignature(byte[])}).</li>
     *   <li>Otherwise the content is scanned from the start; the first well-formed 3-byte
     *       UTF-8 sequence yields {@code "UTF-8"}. The scan gives up (yielding {@code "GBK"})
     *       at the first stray continuation byte, malformed sequence, or byte {@code >= 0xF0}.
     *       Well-formed 2-byte sequences are skipped without deciding anything.</li>
     * </ol>
     *
     * <p>This method does not throw on I/O problems: if the file cannot be opened or read,
     * a message is written to {@code System.err} and {@code "GBK"} is returned.
     *
     * @param file the file to inspect; {@code null} yields {@code "GBK"}
     * @return one of {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"},
     *     {@code "windows-1251"} or {@code "GBK"}
     */
    public static String GetEncoding(File file) {
        if (file == null) {
            return DEFAULT_CHARSET;
        }
        // A single buffered stream serves both passes: mark/reset rewinds it after the
        // signature probe, and try-with-resources closes it on every exit path.
        try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
            in.mark(SIGNATURE_LENGTH);
            byte[] head = new byte[SIGNATURE_LENGTH];
            // A short read leaves the remaining slots at 0, which matches no signature byte.
            if (in.read(head, 0, SIGNATURE_LENGTH) == -1) {
                return DEFAULT_CHARSET;
            }

            String bySignature = charsetFromSignature(head);
            if (bySignature != null) {
                return bySignature;
            }

            in.reset();
            return containsThreeByteUtf8Sequence(in) ? "UTF-8" : DEFAULT_CHARSET;
        } catch (IOException e) {
            System.err.println("Could not read " + file + " to detect its encoding: " + e);
            return DEFAULT_CHARSET;
        }
    }

    /**
     * Maps the leading bytes of a file to a charset name.
     *
     * @param head the first {@link #SIGNATURE_LENGTH} bytes (zero-padded if the file is shorter)
     * @return the charset name for a recognised signature, or {@code null} if none matches
     */
    private static String charsetFromSignature(byte[] head) {
        int b0 = head[0] & 0xFF;
        int b1 = head[1] & 0xFF;
        int b2 = head[2] & 0xFF;

        // Byte-order marks.
        if (b0 == 0xFF && b1 == 0xFE) {
            return "UTF-16LE";
        }
        if (b0 == 0xFE && b1 == 0xFF) {
            return "UTF-16BE";
        }
        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
            return "UTF-8";
        }

        // NOTE(review): the three patterns below are plain ASCII ("\n[0", "\r\n[", "[TI"),
        // not encoding signatures. They are kept unchanged so existing results do not shift;
        // presumably they were tuned for specific input files - confirm before removing.
        if (b0 == 0x0A && b1 == 0x5B && b2 == 0x30) {
            return "UTF-8";
        }
        if (b0 == 0x0D && b1 == 0x0A && b2 == 0x5B) {
            return "GBK";
        }
        if (b0 == 0x5B && b1 == 0x54 && b2 == 0x49) {
            return "windows-1251";
        }
        return null;
    }

    /**
     * Scans {@code in} for the first well-formed 3-byte UTF-8 sequence.
     *
     * @param in stream positioned at the start of the content
     * @return {@code true} if such a sequence is found before the scan is abandoned
     * @throws IOException if reading fails
     */
    private static boolean containsThreeByteUtf8Sequence(InputStream in) throws IOException {
        int b;
        while ((b = in.read()) != -1) {
            if (b >= 0xF0) {
                // Not treated as evidence for UTF-8 by this heuristic.
                return false;
            }
            if (isContinuationByte(b)) {
                // A continuation byte with no lead byte cannot start a UTF-8 sequence.
                return false;
            }
            if (0xC0 <= b && b <= 0xDF) {
                // 2-byte sequence: must be well formed, but decides nothing on its own.
                if (!isContinuationByte(in.read())) {
                    return false;
                }
            } else if (0xE0 <= b && b <= 0xEF) {
                // 3-byte sequence: the verdict is made here either way.
                return isContinuationByte(in.read()) && isContinuationByte(in.read());
            }
            // Bytes below 0x80 are ASCII; keep scanning.
        }
        return false;
    }

    /** Returns whether {@code b} is in the UTF-8 continuation range 0x80..0xBF (-1/EOF is not). */
    private static boolean isContinuationByte(int b) {
        return 0x80 <= b && b <= 0xBF;
    }
}
作者:java未来王者 | 原文地址:https://www.cnblogs.com/javalove2022/archive/2022/09/21/16717560.html

%s 个评论

要回复文章请先登录注册