You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/third_party/pymmseg-cpp/mmseg/mmseg-cpp/word.h

41 lines
1.0 KiB

#ifndef _WORD_H_
#define _WORD_H_
#include <climits>
#include <cstring>
#include "memory.h"
namespace rmmseg {
const int word_embed_len = 4; /* at least 1 char (3 bytes+'\0') */
struct Word {
unsigned char nbytes; /* number of bytes */
char length; /* number of characters */
unsigned short freq;
char text[word_embed_len];
};
/**
* text: the text of the word.
* length: number of characters (not bytes).
* freq: the frequency of the word.
*/
inline Word *make_word(const char *text,
int length = 1,
int freq = 0,
int nbytes = -1) {
if (freq > USHRT_MAX) freq = USHRT_MAX; /* avoid overflow */
if (nbytes == -1) nbytes = static_cast<int>(std::strlen(text));
Word *w = static_cast<Word *>(
pool_alloc(sizeof(Word) + nbytes + 1 - word_embed_len));
w->nbytes = nbytes;
w->length = length;
w->freq = freq;
std::strncpy(w->text, text, nbytes);
w->text[nbytes] = '\0';
return w;
}
}
#endif /* _WORD_H_ */