Java 词法分析器:区分机器码和行号

4c8rllxm  于 2021-07-06  发布在  Java
关注(0)|答案(1)|浏览(307)

我正在开发一个词法分析器,它读取 Cm 汇编并生成标记(token),在大多数情况下运行得很好。我只有两个小问题。首先也是最重要的,我很难区分机器码和行号。如果我没弄错的话,目前的做法是:当第一个字符是数字并且长度小于 2 时,就把它当作行号。问题在于,机器码同样符合这一描述。实际上,行号可以超过两位,而机器码是十六进制的,不一定以数字开头。我想不出能同时满足这些要求的判断条件。第二个问题很小:处理注释时,它会读取除最后一个字符之外的所有字符。输入示例和我的代码见下文。非常感谢大家的帮助!

  1. import java.util.List;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileNotFoundException;
  5. import java.io.FileReader;
  6. import java.io.IOException;
  7. import java.util.ArrayList;
  8. public class Lexer
  9. {
  10. private static File srcFile = null ;
  11. private static String srcFilename = "<srcFilename>";
  12. public static enum Type
  13. {
  14. //This Assembly code has 5 token types (I am a little confused about this, the documentation released does not specify or at least
  15. //go into great detail about about addresses and offsets, and even mentions that they are excluded from Cm Assembly, so I'm not sure
  16. //what to do for those
  17. MNUMONIC_NAMES, LABELS, ADDRESSES, OFFSETS, COMMENTS, LINE_NUMBER;
  18. }
  19. //This class creates an object of type token
  20. public static class Token
  21. {
  22. public final Type t;
  23. public final String c;
  24. //constructor (set type)
  25. public Token(Type t, String c)
  26. {
  27. this.t = t;
  28. this.c = c;
  29. }
  30. //toString
  31. public String toString()
  32. {
  33. if(t == Type.MNUMONIC_NAMES)
  34. {
  35. return "MNUMONIC_NAMES<" + c + ">";
  36. }
  37. if(t == Type.LABELS)
  38. {
  39. return "LABELS<" + c + ">";
  40. }
  41. if(t == Type.ADDRESSES)
  42. {
  43. return "ADDRESSES<" + c + ">";
  44. }
  45. if(t == Type.OFFSETS)
  46. {
  47. return "OFFSETS<" + c + ">";
  48. }
  49. if(t == Type.COMMENTS)
  50. {
  51. return "COMMENTS<" + c + ">";
  52. }
  53. if(t == Type.LINE_NUMBER)
  54. {
  55. return "LINE_NUMBER<" + c + ">";
  56. }
  57. return t.toString();
  58. }
  59. }
  60. //Given a String and an index, get the word/atom starting at index (this will be used for mnemonics and labels,
  61. //since they are both single words
  62. public static String getAtom(String s, int i)
  63. {
  64. int j = i;
  65. for(;j < s.length(); )
  66. {
  67. //while character is a letter, continue
  68. if(Character.isLetter(s.charAt(j)) || Character.isDigit(s.charAt(j)) )
  69. {
  70. j++;
  71. }
  72. else
  73. {
  74. return s.substring(i, j);
  75. }
  76. }
  77. return s.substring(i,j);
  78. }
  79. //Given a String and an index, get the comment starting at index
  80. //I am having trouble with this one, it returns every word except the last letter of the last word
  81. //If I remove the -1 from the length, it goes out of bounds.
  82. public static String getComment(String s, int i)
  83. {
  84. return s.substring( i , (s.length()-1) );
  85. }
  86. //method creates and returns a List of Tokens
  87. public static List<Token> lex(String input)
  88. {
  89. List<Token> result = new ArrayList<Token>();
  90. for(int i = 0;i < input.length();)
  91. {
  92. switch(input.charAt(i))
  93. {
  94. //case comment
  95. case ';':
  96. //System.out.println(input);
  97. String comment = getComment(input, i);
  98. i+=comment.length();
  99. result.add(new Token(Type.COMMENTS, comment));
  100. //if it is a number
  101. case '0': case '1': case '2': case '3': case '4':
  102. case '5': case '6': case '7': case '8': case '9':
  103. String number = getAtom(input, i);
  104. i+=number.length();
  105. if(number.length() < 4)
  106. {
  107. result.add(new Token(Type.LINE_NUMBER, number));
  108. }
  109. else
  110. {
  111. result.add(new Token(Type.ADDRESSES, number));
  112. }
  113. //if not a comment it is either a label or mnemonic (still uncertain about addresses and offsets)
  114. default:
  115. //if white space, continue
  116. if(Character.isWhitespace(input.charAt(i)))
  117. {
  118. i++;
  119. }
  120. //now check for the atom
  121. else
  122. {
  123. String atom = getAtom(input, i);
  124. i += atom.length();
  125. //check if it is a mnemonic (as specified by doc 3, pages 7 -> 8)
  126. if(Check.isMnem(atom))
  127. {
  128. result.add(new Token(Type.MNUMONIC_NAMES, atom));
  129. }
  130. //else it is a label
  131. else
  132. {
  133. result.add(new Token(Type.LABELS, atom));
  134. }
  135. }
  136. break;
  137. }
  138. }
  139. //return list
  140. return result;
  141. }
  142. //main just to test functionality, will run in terminal/command line
  143. public static void main(String[] args) throws IOException {
  144. /*
  145. if(args.length < 1) {
  146. System.out.println("Usage: java Lexer \"((some Scheme) (code to) lex)\".");
  147. return;
  148. }
  149. if (args[0] != null)
  150. {
  151. //check <src>
  152. srcFilename = args[0];
  153. System.out.println("charcount: srcFilename '" + srcFilename + "'");
  154. srcFile = new File(srcFilename);
  155. if(!srcFile.canRead())
  156. {
  157. System.out.println("charcount: cannot open srcFile '" + srcFilename + "'");
  158. return;
  159. }
  160. }
  161. else
  162. {
  163. System.out.println("charcount: [OK] srcFilename = '" + srcFilename + "'");
  164. }
  165. */
  166. srcFilename = "C:\\Users\\abdcg\\Desktop\\School\\Concordia\\Semester 4\\SOEN 341\\Project B\\Sprint 1\\Lexer test\\Test 2.txt";
  167. srcFile = new File(srcFilename);
  168. //Scanner scanny = new Scanner(srcFile);
  169. FileReader fr = new FileReader(srcFile);
  170. BufferedReader br = new BufferedReader(fr);
  171. String line;
  172. while((line = br.readLine()) != null)
  173. {
  174. List<Token> tokens = lex(line);
  175. for(Token t : tokens) {
  176. System.out.println(t);
  177. }
  178. }
  179. //while scanner hasNext(), send the entire line to lex
  180. /*
  181. while(scanny.hasNext())
  182. {
  183. List<Token> tokens = lex(scanny.nextLine());
  184. for(Token t : tokens) {
  185. System.out.println(t);
  186. }
  187. }
  188. */
  189. }
  190. }
  1. Line Addr Machine Code Label Assembly Code Comments
  2. 1 0000 00 halt
  3. 2 0001 01 pop
  4. 3 0002 02 dup
  5. 4 0003 03 exit
  6. 5 0004 04 ret
  7. 6 0005 0C not
  8. 7 0006 0D and
  9. 8 0007 0E or
  10. 9 0008 0F xor
  11. 10 0009 10 neg
  12. 11 000A 11 inc
  13. 12 000B 12 dec
  14. 13 000C 13 add
  15. 14 000D 14 sub
  16. 15 000E 15 mul
  17. 16 000F 16 div
  18. 17 0010 17 rem
  19. 18 0011 18 shl
  20. 19 0012 19 shr
  21. 20 0013 1A teq
  22. 21 0014 1B tne
  23. 22 0015 1C tlt
  24. 23 0016 1D tgt
  25. 24 0017 1E tle
  26. 25 0018 1F tge
  27. 26 0019 00 halt
rqqzpn5f

rqqzpn5f1#

据我理解,你想问的是:行号、地址和机器码之间,除了数值彼此相差 1、一个是十进制而另一个是十六进制之外,是否还应该有更多的区别。
行号是用户输入文件中的行号,我认为这很简单。
Addr 应该在行首打印当前地址值,并在汇编器把指令写入输出时递增。请考虑以下几种情况(如果你的系统允许的话):

  1. label1, instruction # normal line, Addr advances for next line printed
  2. label2, # label only line: Addr does not advance after this
  3. instruction # instruction only: Addr does advance again
  4. <blank> # Addr does not advance, just as with label,
  5. br label # multi byte instruction so, Addr advances by more than 1
  6. .text # sample directive: changes Addr to current end of text

机器码应该是与该行汇编指令相对应的十六进制值。对于多字节指令,应该打印其所有字节。

相关问题