In this LeetCode UTF-8 Validation problem, you are given an integer array data in which each element represents one byte; return whether the array is a valid UTF-8 encoding.

A character in UTF8 can be from 1 to 4 bytes long, subjected to the following rules:

  1. For a 1-byte character, the first bit is a 0, followed by its Unicode code.
  2. For an n-byte character, the first n bits are all ones and the (n + 1)-th bit is 0, followed by n - 1 bytes whose most significant 2 bits are 10.

Leetcode UTF-8 Validation problem solution

Problem solution in Python.

def validUtf8(self, data: List[int], i=0) -> bool:
    """Return True if ``data`` is a sequence of well-formed UTF-8 characters.

    Each element stands for one byte; only its least significant 8 bits
    are inspected, so higher bits in an element do not affect the result.

    Args:
        data: Integers, one per byte.
        i: Index at which scanning starts (defaults to the beginning).

    Returns:
        True when every character starting at ``i`` has a valid 1- to
        4-byte layout (lead byte followed by the required number of
        ``10xxxxxx`` continuation bytes), False otherwise. An empty
        input yields True.
    """
    size = len(data)

    def check_rem(start, total):
        # All `total` positions from `start` must exist and hold a
        # continuation byte (top two bits == 10).
        return all(
            k < size and ((data[k] & 0xFF) >> 6) == 0b10
            for k in range(start, start + total)
        )

    while i < size:
        lead = data[i] & 0xFF  # keep only the byte's 8 data bits
        if lead >> 7 == 0:
            i += 1  # 0xxxxxxx: single-byte character
        elif lead >> 5 == 0b110 and check_rem(i + 1, 1):
            i += 2  # 110xxxxx + 1 continuation byte
        elif lead >> 4 == 0b1110 and check_rem(i + 1, 2):
            i += 3  # 1110xxxx + 2 continuation bytes
        elif lead >> 3 == 0b11110 and check_rem(i + 1, 3):
            i += 4  # 11110xxx + 3 continuation bytes
        else:
            # Stray continuation byte, 5+ leading ones, or a truncated /
            # malformed multi-byte sequence.
            return False
    return True



Problem solution in Java.

class Solution {
    /**
     * Checks whether the given integers form a valid UTF-8 byte sequence.
     *
     * <p>Each element stands for one byte; only its least significant 8 bits
     * are inspected. The array is scanned once while tracking how many
     * continuation bytes ({@code 10xxxxxx}) the current character still needs.
     *
     * @param arr integers, one per byte
     * @return {@code true} if every character has a valid 1- to 4-byte layout
     *         and no character is left unfinished at the end of the array
     */
    public boolean validUtf8(int[] arr) {
        int pending = 0; // continuation bytes still expected
        for (int raw : arr) {
            int octet = raw & 0xFF; // keep only the byte's 8 data bits
            if (pending > 0) {
                // Inside a multi-byte character: must be 10xxxxxx.
                if ((octet >> 6) != 0b10) {
                    return false;
                }
                pending--;
            } else if ((octet >> 7) == 0b0) {
                // 0xxxxxxx: single-byte character, nothing more to expect.
                pending = 0;
            } else if ((octet >> 5) == 0b110) {
                pending = 1;
            } else if ((octet >> 4) == 0b1110) {
                pending = 2;
            } else if ((octet >> 3) == 0b11110) {
                pending = 3;
            } else {
                // Stray continuation byte or five or more leading ones.
                return false;
            }
        }
        // A trailing, unfinished character makes the input invalid.
        return pending == 0;
    }
}


Problem solution in C++.

class Solution {
public:
    /// Counts the consecutive set bits of `n` starting at bit 7 and moving
    /// towards bit 0. Only bits 7..0 are examined, so the result is in [0, 8].
    int csb(int n) {
        int ones = 0;
        for (int bit = 7; bit >= 0 && (n & (1 << bit)) != 0; --bit) {
            ++ones;
        }
        return ones;
    }

    /// Returns true if `data` (one byte per element, low 8 bits used) is a
    /// sequence of well-formed 1- to 4-byte UTF-8 characters.
    ///
    /// The number of leading one bits classifies each byte:
    ///   0    -> single-byte character,
    ///   1    -> continuation byte (10xxxxxx),
    ///   2..4 -> lead byte of a character of that many bytes,
    ///   >4   -> never valid.
    /// An empty vector is valid.
    bool validUtf8(std::vector<int>& data) {
        int pending = 0;  // continuation bytes still owed to the current character
        for (const int value : data) {
            const int ones = csb(value);
            if (ones > 4) {
                return false;
            }
            if (ones == 1) {
                // A continuation byte is only legal inside a multi-byte character.
                if (pending == 0) {
                    return false;
                }
                --pending;
            } else {
                // A new character may not start while another is unfinished.
                if (pending != 0) {
                    return false;
                }
                pending = (ones == 0) ? 0 : ones - 1;
            }
        }
        // Input must not end in the middle of a character.
        return pending == 0;
    }
};


Problem solution in C.

/*
 * Returns true if the first dataSize elements of data form a sequence of
 * well-formed 1- to 4-byte UTF-8 characters.
 *
 * Each element stands for one byte; only bits 7..0 are inspected.
 * A dataSize of zero or less is treated as empty input and yields true
 * without reading data.
 */
bool validUtf8(int* data, int dataSize)
{
    int pending = 0; /* continuation bytes still expected */

    /* Signed index: comparing an unsigned index with a negative dataSize
     * would convert the size to a huge value and read out of bounds. */
    for (int i = 0; i < dataSize; i++)
    {
        const int octet = data[i] & 0xFF;

        if (pending == 0)
        {
            if ((octet >> 7) == 0x0)        /* 0xxxxxxx */
                pending = 0;
            else if ((octet >> 5) == 0x6)   /* 110xxxxx */
                pending = 1;
            else if ((octet >> 4) == 0xE)   /* 1110xxxx */
                pending = 2;
            else if ((octet >> 3) == 0x1E)  /* 11110xxx */
                pending = 3;
            else                            /* stray 10xxxxxx or 11111xxx */
                return false;
        }
        else
        {
            if ((octet >> 6) == 0x2)        /* 10xxxxxx */
                pending--;
            else
                return false;
        }
    }

    /* Input must not end in the middle of a character. */
    return pending == 0;
}